Background

The first step in this project is to understand the distribution of, and relationships between, the words, tokens, and phrases in the text. This report builds a word frequency table, constructs 1-gram, 2-gram and 3-gram term-document matrices, and conducts an exploratory analysis of the words.
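
To make the n-gram terminology concrete, the small example below (illustrative only, not part of the analysis) shows how a sentence splits into 1-grams, 2-grams and 3-grams using RWeka's NGramTokenizer, the same tokenizer used later in this report:

library(RWeka)

# Toy sentence used only for illustration
sentence <- "the quick brown fox jumps"
NGramTokenizer(sentence, Weka_control(min = 1, max = 1))  # 1-grams: single words
NGramTokenizer(sentence, Weka_control(min = 2, max = 2))  # 2-grams: "the quick", "quick brown", ...
NGramTokenizer(sentence, Weka_control(min = 3, max = 3))  # 3-grams: "the quick brown", ...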

Data Preparation

The training data is downloaded from the link below:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The zip file contains the following files:

1. en_US.blogs.txt
2. en_US.news.txt
3. en_US.twitter.txt

First, load the packages that will be used throughout the report:

library(tm)
library(NLP)
library(ggplot2)
library(RWeka)
library(ngram)
library(slam)

Read in the original data files and check the number of lines and file size of each:

## Read in Original Datasets
setwd("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/")

con <- file("en_US.twitter.txt")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("en_US.news.txt")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("en_US.blogs.txt")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)


## Check number of lines

twitter_len <- length(twitter)
news_len <- length(news)
blogs_len <- length(blogs)

## Check file size
twitter_size <- file.size("en_US.twitter.txt")/1024^2
news_size <- file.size("en_US.news.txt")/1024^2
blogs_size <- file.size("en_US.blogs.txt")/1024^2


summary <- data.frame(c(twitter_len, news_len, blogs_len),
                      c(twitter_size, news_size, blogs_size),
                      row.names = c("twitter", "news", "blogs"))

colnames(summary) <- c("Number of Lines", "File Size (MB)")

summary
##         Number of Lines File Size (MB)
## twitter         2360148       159.3641
## news              77259       196.2775
## blogs            899288       200.4242

The original datasets are large, so a 10% subsample of each is used in the remainder of the analysis. The samples are saved to separate txt files so they can be reused.

# Chosen sample size
sample_size <- 0.1

# Set a seed so the subsamples are reproducible
set.seed(1234)

# Creating subsets
twitter_index <- sample(seq_len(length(twitter)), length(twitter) * sample_size)
news_index <- sample(seq_len(length(news)), length(news) * sample_size)
blogs_index <- sample(seq_len(length(blogs)), length(blogs) * sample_size)

twitter_sub <- twitter[twitter_index]
writeLines(twitter_sub, con = "twitter_sub.txt", sep = "\n")
news_sub <- news[news_index]
writeLines(news_sub, con = "news_sub.txt", sep = "\n")
blogs_sub <- blogs[blogs_index]
writeLines(blogs_sub, con = "blogs_sub.txt", sep = "\n")

Create Corpus and Clean Data

Create the corpus and clean it by removing profanity and special symbols such as punctuation and numbers. The profanity word list is downloaded from: https://www.cs.cmu.edu/~biglou/resources/

## Read in the sub-sample txt files for twitter, news and blogs
corpus.folder <- "C:/Users/i58197/Desktop/Sub/"
corpus <- VCorpus(DirSource(corpus.folder, encoding = "UTF-8"))

# Convert to lowercase early so case differences do not prevent later word matches
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Remove profanity words
profanity <- readLines("C:/Users/i58197/Desktop/bad_words.txt")
corpus <- tm_map(corpus, removeWords, profanity)
# Remove non-ASCII characters
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")
corpus <- tm_map(corpus, content_transformer(removeNonASCII))
# Strip extra whitespace left behind by the removals
corpus <- tm_map(corpus, stripWhitespace)
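
To verify that the cleaning steps behaved as expected, a few lines of the cleaned corpus can be inspected (a quick spot check, not required for the rest of the analysis):

# Print the first few lines of the first document in the cleaned corpus
writeLines(head(content(corpus[[1]]), 3))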

Build Term Document Matrix

Now we can analyze word frequencies for 1-grams, 2-grams and 3-grams:

1-Gram

## 1-gram
corpus_tdm_1 <- TermDocumentMatrix(corpus)
wordMatrix_1 <- as.data.frame(as.matrix(corpus_tdm_1))
wordSort_1 <- sort(rowSums(wordMatrix_1), decreasing = TRUE)
result_1 <- data.frame(word = names(wordSort_1), freq = wordSort_1)
top_30_result_1 <- result_1[1:30, ]
top_30_result_1
##        word   freq
## the     the 294960
## and     and 159416
## you     you  85658
## for     for  77885
## that   that  72505
## with   with  47765
## this   this  43159
## was     was  41486
## have   have  39716
## are     are  36174
## but     but  33937
## not     not  30597
## your   your  27441
## all     all  26739
## just   just  25329
## from   from  24361
## its     its  24160
## out     out  22898
## like   like  22739
## what   what  22676
## they   they  21894
## will   will  21552
## one     one  21263
## about about  21241
## when   when  19342
## can     can  19306
## get     get  18763
## time   time  16712
## more   more  16256
## there there  15861

Plot the Top 30 Most Frequent 1-Grams

ggplot(data = top_30_result_1, aes(x = word, y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  guides(fill = FALSE) +
  theme(axis.text.x = element_text(angle = 90))
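
Note that aes(x = word) plots the bars in alphabetical order. If ordering the bars by descending frequency is preferred, one possible variant (a stylistic choice, not a correction of the plot above) is:

ggplot(data = top_30_result_1, aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  guides(fill = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("word")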

2-Gram

## 2-gram
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus_tdm_2 <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
wordMatrix_2 <- as.data.frame(as.matrix(corpus_tdm_2))
wordSort_2 <- sort(rowSums(wordMatrix_2), decreasing = TRUE)
result_2 <- data.frame(word = names(wordSort_2), freq = wordSort_2)
top_30_result_2 <- result_2[1:30, ]
top_30_result_2
##              word  freq
## of the     of the 25910
## in the     in the 24764
## for the   for the 13833
## to the     to the 13831
## on the     on the 12967
## to be       to be 11841
## at the     at the  8874
## i have     i have  7938
## and the   and the  7828
## i was       i was  7669
## is a         is a  7566
## in a         in a  7269
## i am         i am  7193
## and i       and i  7088
## it was     it was  6911
## it is       it is  6766
## for a       for a  6594
## with the with the  6575
## if you     if you  6519
## going to going to  6060
## have a     have a  5969
## is the     is the  5617
## will be   will be  5523
## to get     to get  5502
## from the from the  5287
## i dont     i dont  5132
## that i     that i  5097
## want to   want to  5082
## one of     one of  4992
## with a     with a  4930

Plot the Top 30 Most Frequent 2-Grams

ggplot(data = top_30_result_2, aes(x = word, y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  guides(fill = FALSE) +
  theme(axis.text.x = element_text(angle = 90))

3-Gram

## 3-gram
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus_tdm_3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
wordMatrix_3 <- as.data.frame(as.matrix(corpus_tdm_3))
wordSort_3 <- sort(rowSums(wordMatrix_3), decreasing = TRUE)
result_3 <- data.frame(word = names(wordSort_3), freq = wordSort_3)
top_30_result_3 <- result_3[1:30, ]
top_30_result_3
##                                  word freq
## thanks for the         thanks for the 2330
## one of the                 one of the 2129
## a lot of                     a lot of 1933
## i want to                   i want to 1323
## to be a                       to be a 1291
## going to be               going to be 1245
## it was a                     it was a 1041
## cant wait to             cant wait to 1040
## i have a                     i have a 1036
## i have to                   i have to 1021
## looking forward to looking forward to 1018
## thank you for           thank you for  995
## i dont know               i dont know  969
## be able to                 be able to  961
## out of the                 out of the  952
## the end of                 the end of  942
## i love you                 i love you  941
## i need to                   i need to  913
## im going to               im going to  912
## the rest of               the rest of  881
## some of the               some of the  869
## as well as                 as well as  812
## for the follow         for the follow  806
## one of my                   one of my  805
## you want to               you want to  786
## this is a                   this is a  764
## is going to               is going to  736
## a couple of               a couple of  729
## to go to                     to go to  720
## the fact that           the fact that  707

Plot the Top 30 Most Frequent 3-Grams

ggplot(data = top_30_result_3, aes(x = word, y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  guides(fill = FALSE) +
  theme(axis.text.x = element_text(angle = 90))
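
Converting each term-document matrix to a dense data frame with as.matrix() works for the 10% sample but will not scale well to the full datasets. A lower-memory sketch of the same frequency calculation, using the slam package loaded earlier (shown for the 1-gram matrix; the 2-gram and 3-gram cases are analogous), is:

# Sketch: frequency counts computed directly on the sparse TDM, no dense matrix needed
wordSort_1_sparse <- sort(slam::row_sums(corpus_tdm_1), decreasing = TRUE)
head(wordSort_1_sparse, 30)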

Next Steps

For the final analysis, text modelling, and text prediction, the following work remains:

1. Build an n-gram model on the full text datasets (a rough sketch of the intended lookup appears after this list).
2. Optimize the model for low memory usage and faster processing.
3. Implement the model as a Shiny app.
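
As a rough illustration of where the n-gram modelling in step 1 is headed, the sketch below looks up the most frequent trigram continuation of the last two words typed and falls back to the bigram table when no trigram matches. The helper name predict_next is hypothetical, and the final model will use proper smoothing and backoff trained on the full datasets:

# Hypothetical sketch of a simple backoff lookup using the result_2 and result_3 tables built above
predict_next <- function(phrase, tri = result_3, bi = result_2) {
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  # Try a trigram match on the last two words
  hits <- tri[grepl(paste0("^", paste(words, collapse = " "), " "), tri$word), ]
  if (nrow(hits) == 0) {
    # Back off to a bigram match on the last word only
    hits <- bi[grepl(paste0("^", tail(words, 1), " "), bi$word), ]
  }
  if (nrow(hits) == 0) return(NA_character_)
  # The tables are sorted by frequency, so the first match is the best guess;
  # return the final word of that n-gram
  tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1)
}

predict_next("thanks for")  # should suggest "the" given the counts above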