The first step in this project is to understand the distribution of, and relationships between, the words, tokens, and phrases in the text. This report builds a word frequency table, constructs 1-gram, 2-gram, and 3-gram term-document matrices, and conducts an exploratory analysis of the words.
The training data is downloaded from the link below:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
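If the files are not already on disk, the download and extraction can also be scripted; a minimal sketch in R (the local file and folder names here are illustrative):
# Download and unzip the training data if it is not already on disk
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
    download.file(zip_url, destfile = zip_file, mode = "wb")
}
unzip(zip_file, exdir = "Coursera_SwiftKey")  # extracts the final/en_US folder used below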
The zip file contains the following files:
1. en_US.blogs.txt
2. en_US.news.txt
3. en_US.twitter.txt
First, load the packages that will be used in this report:
library(tm)
library(NLP)
library(ggplot2)
library(RWeka)
library(ngram)
library(slam)
Read in the original data files and check the number of lines and file size of each:
## Read in Original Datasets
setwd("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/")
twitter <- readLines(con <- file("en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
news <- readLines(con <- file("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
blogs <- readLines(con <- file("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
## Check number of lines
twitter_len <- length(twitter)
news_len <- length(news)
blogs_len <- length(blogs)
## Check file size
twitter_size <- file.size("en_US.twitter.txt")/1024^2
news_size <- file.size("en_US.news.txt")/1024^2
blogs_size <- file.size("en_US.blogs.txt")/1024^2
summary_df <- data.frame(c(twitter_len, news_len, blogs_len),
                         c(twitter_size, news_size, blogs_size),
                         row.names = c("twitter", "news", "blogs"))
colnames(summary_df) <- c("Number of Lines", "File Size (MB)")
summary_df
##         Number of Lines File Size (MB)
## twitter         2360148       159.3641
## news              77259       196.2775
## blogs            899288       200.4242
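As an additional rough measure of scale, approximate word counts can be estimated with a simple whitespace split (an estimate only, not the tokenization used later):
# Approximate word counts per dataset via whitespace splitting (rough estimate)
twitter_words <- sum(lengths(strsplit(twitter, "\\s+")))
news_words <- sum(lengths(strsplit(news, "\\s+")))
blogs_words <- sum(lengths(strsplit(blogs, "\\s+")))
c(twitter = twitter_words, news = news_words, blogs = blogs_words)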
The amount of data in the original datasets is large, so a 10% subsample of each is used in the following analysis. The samples are saved in separate txt files for repeated use later.
# Chosen sample size
sample_size <- 0.1
# Set seed so the subsamples are reproducible
set.seed(1234)
# Creating subsets
twitter_index <- sample(seq_len(length(twitter)), floor(length(twitter) * sample_size))
news_index <- sample(seq_len(length(news)), floor(length(news) * sample_size))
blogs_index <- sample(seq_len(length(blogs)), floor(length(blogs) * sample_size))
twitter_sub <- twitter[twitter_index]
writeLines(twitter_sub, con = "twitter_sub.txt")
news_sub <- news[news_index]
writeLines(news_sub, con = "news_sub.txt")
blogs_sub <- blogs[blogs_index]
writeLines(blogs_sub, con = "blogs_sub.txt")
Create a corpus and clean it by removing profanity and special symbols such as punctuation and numbers. The profanity word list is downloaded from: https://www.cs.cmu.edu/~biglou/resources/
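The list is read from a local file in the code below; a minimal sketch for fetching it first (the file name on that page is assumed to be bad-words.txt):
# Fetch the profanity list if it is not already saved locally (file name assumed)
if (!file.exists("C:/Users/i58197/Desktop/bad_words.txt")) {
    download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt",
                  destfile = "C:/Users/i58197/Desktop/bad_words.txt", mode = "wb")
}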
## Read in sub-sample txt files for twitter, news and blogs
corpus.folder <- "C:/Users/i58197/Desktop/Sub/"
corpus <- VCorpus(DirSource(corpus.folder, encoding = "UTF-8"))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Remove extra whitespace
corpus <- tm_map(corpus, stripWhitespace)
# Remove profanity words
profanity <- readLines("C:/Users/i58197/Desktop/bad_words.txt")
corpus <- tm_map(corpus, removeWords, profanity)
# Remove non-ASCII characters
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")
corpus <- tm_map(corpus, content_transformer(removeNonASCII))
# Convert to lowercase
corpus <- tm_map(corpus, content_transformer(tolower))
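A quick sanity check that the cleaning steps took effect, peeking at the first cleaned document:
# Peek at the first two lines of the first cleaned document
head(content(corpus[[1]]), 2)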
Now we can analyze word frequencies for 1-grams, 2-grams, and 3-grams:
## 1-gram
corpus_tdm_1 <- TermDocumentMatrix(corpus)
wordMatrix_1 <- as.data.frame(as.matrix(corpus_tdm_1))
wordSort_1 <- sort(rowSums(wordMatrix_1), decreasing = TRUE)
result_1 <- data.frame(word = names(wordSort_1), freq = wordSort_1)
top_30_result_1 <- result_1[1:30, ]
top_30_result_1
## word freq
## the the 294960
## and and 159416
## you you 85658
## for for 77885
## that that 72505
## with with 47765
## this this 43159
## was was 41486
## have have 39716
## are are 36174
## but but 33937
## not not 30597
## your your 27441
## all all 26739
## just just 25329
## from from 24361
## its its 24160
## out out 22898
## like like 22739
## what what 22676
## they they 21894
## will will 21552
## one one 21263
## about about 21241
## when when 19342
## can can 19306
## get get 18763
## time time 16712
## more more 16256
## there there 15861
ggplot(data=top_30_result_1, aes(x=word, y=freq, fill=freq)) + geom_bar(stat="identity") + guides(fill=FALSE) + theme(axis.text.x=element_text(angle=90))
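Because result_1 is sorted by frequency, it can also be used to gauge how concentrated the vocabulary is, for example how many unique words are needed to cover 50% and 90% of all word instances in the sample (a rough sketch):
# Cumulative coverage: unique words needed to cover a given share of all word instances
coverage <- cumsum(result_1$freq) / sum(result_1$freq)
words_50 <- which(coverage >= 0.5)[1]   # words covering 50% of instances
words_90 <- which(coverage >= 0.9)[1]   # words covering 90% of instances
c(cover_50 = words_50, cover_90 = words_90)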
## 2-gram
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus_tdm_2 <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
wordMatrix_2 <- as.data.frame(as.matrix(corpus_tdm_2))
wordSort_2 <- sort(rowSums(wordMatrix_2), decreasing = TRUE)
result_2 <- data.frame(word = names(wordSort_2), freq = wordSort_2)
top_30_result_2 <- result_2[1:30, ]
top_30_result_2
## word freq
## of the of the 25910
## in the in the 24764
## for the for the 13833
## to the to the 13831
## on the on the 12967
## to be to be 11841
## at the at the 8874
## i have i have 7938
## and the and the 7828
## i was i was 7669
## is a is a 7566
## in a in a 7269
## i am i am 7193
## and i and i 7088
## it was it was 6911
## it is it is 6766
## for a for a 6594
## with the with the 6575
## if you if you 6519
## going to going to 6060
## have a have a 5969
## is the is the 5617
## will be will be 5523
## to get to get 5502
## from the from the 5287
## i dont i dont 5132
## that i that i 5097
## want to want to 5082
## one of one of 4992
## with a with a 4930
ggplot(data=top_30_result_2, aes(x=word, y=freq, fill=freq)) + geom_bar(stat="identity") + guides(fill=FALSE) + theme(axis.text.x=element_text(angle=90))
## 3-gram
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus_tdm_3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
wordMatrix_3 <- as.data.frame(as.matrix(corpus_tdm_3))
wordSort_3 <- sort(rowSums(wordMatrix_3), decreasing = TRUE)
result_3 <- data.frame(word = names(wordSort_3), freq = wordSort_3)
top_30_result_3 <- result_3[1:30, ]
top_30_result_3
## word freq
## thanks for the thanks for the 2330
## one of the one of the 2129
## a lot of a lot of 1933
## i want to i want to 1323
## to be a to be a 1291
## going to be going to be 1245
## it was a it was a 1041
## cant wait to cant wait to 1040
## i have a i have a 1036
## i have to i have to 1021
## looking forward to looking forward to 1018
## thank you for thank you for 995
## i dont know i dont know 969
## be able to be able to 961
## out of the out of the 952
## the end of the end of 942
## i love you i love you 941
## i need to i need to 913
## im going to im going to 912
## the rest of the rest of 881
## some of the some of the 869
## as well as as well as 812
## for the follow for the follow 806
## one of my one of my 805
## you want to you want to 786
## this is a this is a 764
## is going to is going to 736
## a couple of a couple of 729
## to go to to go to 720
## the fact that the fact that 707
ggplot(data=top_30_result_3, aes(x=word, y=freq, fill=freq)) + geom_bar(stat="identity") + guides(fill=FALSE) + theme(axis.text.x=element_text(angle=90))
For the final text modelling and prediction application, the next steps are:
1. N-gram modelling of the full text datasets (a naive lookup sketch based on the tables above follows this list).
2. Optimize the model for low memory utilization and faster processing.
3. Implement the model as a Shiny app.
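As a rough illustration of step 1 (not the final model), a naive lookup that returns the most frequent completions of a two-word prefix from the trigram frequency table built above; predict_next is a hypothetical helper name:
# Naive next-word lookup from the trigram frequency table (illustration only)
predict_next <- function(prefix, trigram_freq = result_3, n = 3) {
    prefix <- tolower(trimws(prefix))
    # keep trigrams whose first two words match the prefix
    hits <- trigram_freq[grepl(paste0("^", prefix, " "), trigram_freq$word), ]
    if (nrow(hits) == 0) return(character(0))
    # the predicted word is the last token of each of the top matching trigrams
    sub(".* ", "", head(as.character(hits$word), n))
}
# Example: most frequent words following "thanks for"
predict_next("thanks for")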