Team Members

  1. Jenifer PK
  2. Srinath Subramani KS

ABOUT DATASET:

The project that we have selected for this course is "TEXT PREDICTION - EXPLORATORY ANALYSIS ON SWIFTKEY DATASETS". The HC Corpora dataset is compiled from a large number of news sites, blogs and Twitter feeds. It contains three files in each of four languages (Russian, Finnish, German and English); this project focuses on the English-language files. The names of the data files are as follows:

  1. en_US.blogs.txt
  2. en_US.twitter.txt
  3. en_US.news.txt

The datasets will be referred to as "Blogs", "Twitter" and "News" for the remainder of this report.

  Source    Words         Characters    Letters       Lines      Avg Word Length  Avg Words/Line
  Blogs     37,242,000    206,824,000   163,815,000   899,000    4.40             41.41
  News      34,275,000    203,223,000   162,803,000   1,010,000  4.75             33.93
  Twitter   29,876,000    162,122,000   125,998,000   2,360,000  4.22             12.66
  Total     101,393,000   572,170,000   452,617,000   4,269,000  4.46             23.75
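As a minimal sketch of how these per-file statistics could be reproduced in R (assuming blogs, news and tweets are character vectors holding one line of text per element, as read later in this report):

# Sketch: summary statistics for one corpus held as a character vector of lines
summariseCorpus = function (lines) {
  words   = sum (sapply (strsplit (lines, "\\s+"), length))   # total word count
  chars   = sum (nchar (lines))                               # total characters
  letters = sum (nchar (gsub ("[^A-Za-z]", "", lines)))       # alphabetic characters only
  c (Words = words,
     Characters = chars,
     Letters = letters,
     Lines = length (lines),
     AvgWordLength = round (letters / words, 2),
     AvgWordsPerLine = round (words / length (lines), 2))
}

# Example usage (after the three files have been read):
# rbind (Blogs = summariseCorpus (blogs),
#        News = summariseCorpus (news),
#        Twitter = summariseCorpus (tweets))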

TASK:

  1. Explore the data
  2. Profanity filtering - removing profanity and other words you do not want to predict.
  3. Tokenization - identifying appropriate tokens such as words, punctuation, and numbers.
  4. Train a Natural Language Processing model on the data
  5. Build a Shiny app with the model (see the sketch after the paragraphs below)

The task is to build a predictive model that suggests the next word as a user types a word or phrase, similar to how Google predicts what you want to search for based on the most popular search terms.
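As an illustration only (not the project's final model), a next-word lookup can be sketched on top of the 2-gram and 3-gram frequency tables built later in this report (TwoTokenTermFreq and ThreeTokenTermFreq); the helper name predictNextWord and the simple frequency back-off are assumptions made for the sketch.

# Illustrative sketch: pick the most frequent n-gram continuation of the typed phrase
predictNextWord = function (phrase, bigramFreq, trigramFreq) {
  words = unlist (strsplit (tolower (phrase), "\\s+"))
  n = length (words)
  # Try trigram continuations of the last two words first
  if (n >= 2) {
    prefix = paste (words[n - 1], words[n])
    hits = trigramFreq[grepl (paste0 ("^", prefix, " "), names (trigramFreq))]
    if (length (hits) > 0)
      return (sub (paste0 ("^", prefix, " "), "", names (hits)[1]))
  }
  # Back off to bigram continuations of the last word
  if (n >= 1) {
    prefix = words[n]
    hits = bigramFreq[grepl (paste0 ("^", prefix, " "), names (bigramFreq))]
    if (length (hits) > 0)
      return (sub (paste0 ("^", prefix, " "), "", names (hits)[1]))
  }
  NA_character_
}

# Example usage (after the frequency tables have been computed):
# predictNextWord ("thanks for the", TwoTokenTermFreq, ThreeTokenTermFreq)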

From the SwiftKey Twitter, News and Blogs files in English we create a data product that predicts the next word. The blogs and news datasets are each approximately 200 megabytes, and the Twitter dataset is approximately 160 megabytes.

In this project we apply Natural Language Processing (NLP), text mining, and the tools available in R, both for exploratory data analysis and for the subsequent text modelling and prediction.
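For task 5, the Shiny front end could be as small as the sketch below; it assumes the hypothetical predictNextWord helper sketched above and is not the final application.

# Minimal Shiny sketch (assumption: predictNextWord and the frequency tables exist)
library (shiny)

ui = fluidPage (
  textInput ("phrase", "Type a phrase:"),
  textOutput ("nextWord")
)

server = function (input, output) {
  output$nextWord = renderText ({
    predictNextWord (input$phrase, TwoTokenTermFreq, ThreeTokenTermFreq)
  })
}

shinyApp (ui = ui, server = server)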

DATASET DOWNLOAD:

We downloaded the dataset for this project from the following web site and unzipped it into our working directory:

http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.

The dataset is originally from HC Corpora.

#To list Files
list.files("./en_US/")
## character(0)
#No of Characters in Blog and No of Lines in Blog

connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = -1, encoding = "UTF-8")
close (connection)
nCharBlog = sum(nchar(blogs)); lenBlog = length(blogs)
nCharBlog;lenBlog
## [1] 206824505
## [1] 899288
#No of Characters in News and No of Lines in News
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.news.txt", "r")
news = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on 'E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/
## en_US.news.txt'
close (connection)
nCharNews = sum(nchar(news)); lenNews = length(news)
nCharNews;lenNews
## [1] 15639408
## [1] 77259
#No of Characters in Twitter and No of Lines in Twitter
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close (connection)
nCharTweets = sum(nchar(tweets)); lenTweets = length (tweets)
nCharTweets;lenTweets
## [1] 162096031
## [1] 2360148
#No of words in blogs
wordblog=sum(sapply(strsplit(blogs, " "), length))
wordblog
## [1] 37334131
#No of words in news
wordnews=sum(sapply(strsplit(news, " "), length))
wordnews
## [1] 2643969
#No of words in tweets
wordtweet=sum(sapply(strsplit(tweets, " "), length))
wordtweet
## [1] 30373543
setwd("E:/Acadamics/Project")
library(NLP)
library(openNLP)
library(tm)
library(RWeka)
library(qdapDictionaries)
## Warning: package 'qdapDictionaries' was built under R version 3.2.2
library(qdapRegex)
## Warning: package 'qdapRegex' was built under R version 3.2.2
library(qdapTools)
## Warning: package 'qdapTools' was built under R version 3.2.2
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 3.2.2
library(qdap)
## Warning: package 'qdap' was built under R version 3.2.2
## 
## Attaching package: 'qdap'
## 
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## 
## The following object is masked from 'package:NLP':
## 
##     ngrams
## 
## The following object is masked from 'package:base':
## 
##     Filter
library(stringr)
## 
## Attaching package: 'stringr'
## 
## The following object is masked from 'package:qdap':
## 
##     %>%
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(RColorBrewer)
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.2.2
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.2
fileName = "Coursera-SwiftKey.zip"
if (!file.exists (fileName))
  download.file (url = "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", destfile = fileName)

unzip (zipfile = fileName, overwrite = TRUE)
list.files ("./final/")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
list.files ("./final/en_US/")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"
linesToRead = 100

connection = file ("./final/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = -1, encoding = "UTF-8")
close (connection)
nCharBlog = nchar (blogs); lenBlog = length (blogs)

connection = file ("./final/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = linesToRead, encoding = "UTF-8")
close (connection)

connection = file ("./final/en_US/en_US.news.txt", "r")
news = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on './final/en_US/en_US.news.txt'
close (connection)
nCharNews = nchar (news); lenNews = length (news)

connection = file ("./final/en_US/en_US.news.txt", "r")
news = readLines (connection, n = linesToRead, encoding = "UTF-8")
close (connection)

connection = file ("./final/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close (connection)
nCharTweets = nchar (tweets); lenTweets = length (tweets)

connection = file ("./final/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = linesToRead, encoding = "UTF-8")
close (connection)

# Remove retweets
tweets = gsub ("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets)
# Remove @people
tweets = gsub ("@\\w+", "", tweets)

textData = c (blogs, news, tweets)

#Replace abbreviations so that the sentences are not split at incorrect places.
textData = replace_abbreviation (textData)

# convert paragraphs to sentences
myEndmarks = c("?", ".", "!", "|", ":", "\n", "\r\n")
textData = sent_detect (textData, endmarks = myEndmarks, rm.bracket = FALSE)

textCorpus = VCorpus (VectorSource (textData))

# Remove URL
removeURL = function(x) gsub("http\\w+", "", x)
textCorpus = tm_map(textCorpus, content_transformer(removeURL))

# Trim leading and trailing white space
textCorpus = tm_map(textCorpus, content_transformer(trimws))

# Only after the above, remove punctuation and numbers and convert to lower case
textCorpus = tm_map(textCorpus, removePunctuation)
textCorpus = tm_map(textCorpus, removeNumbers)
textCorpus = tm_map(textCorpus, content_transformer(tolower))

# Remove profanity
profanityFileName = "profanity.txt"
if (!file.exists(profanityFileName)) download.file(url = "http://pattern-for-python.googlecode.com/svn-history/r20/trunk/pattern/vector/wordlists/profanity.txt",
                                                   destfile = profanityFileName)
profanityWords = str_trim(as.character(read.table(profanityFileName, sep = ",",
                                                  stringsAsFactors = FALSE)))
## Warning in read.table(profanityFileName, sep = ",", stringsAsFactors =
## FALSE): incomplete final line found by readTableHeader on 'profanity.txt'
textCorpus = tm_map(textCorpus, removeWords, profanityWords)

# Finally remove all the white space that was created by the removals
textCorpus = tm_map(textCorpus, stripWhitespace)


OneGramTokenizer = function (corpus) {
  NGramTokenizer (corpus, Weka_control (min = 1, max = 1))
}

TwoGramTokenizer = function (corpus) {
  NGramTokenizer (corpus, Weka_control (min = 2, max = 2))
}

ThreeGramTokenizer = function (corpus) {
  NGramTokenizer (corpus, Weka_control (min = 3, max = 3))
}

tdmOneToken = TermDocumentMatrix (textCorpus,
                                  control = list (tokenize = OneGramTokenizer))

tdmTwoToken = TermDocumentMatrix (textCorpus,
                                  control = list (tokenize = TwoGramTokenizer))

tdmThreeToken = TermDocumentMatrix (textCorpus,
                                    control = list (tokenize = ThreeGramTokenizer))


OneTokenTermFreq = sort (rowSums (as.matrix (tdmOneToken)), decreasing = TRUE)
OneTokenTermFreqPerc = 100 * (OneTokenTermFreq / sum (OneTokenTermFreq))
OneTokenTermFreqTopTwenty = head (OneTokenTermFreqPerc, 20)

TwoTokenTermFreq = sort (rowSums (as.matrix (tdmTwoToken)), decreasing = TRUE)
TwoTokenTermFreqPerc = 100 * (TwoTokenTermFreq / sum (TwoTokenTermFreq))
TwoTokenTermFreqTopTwenty = head (TwoTokenTermFreqPerc, 20)

ThreeTokenTermFreq = sort (rowSums (as.matrix (tdmThreeToken)), decreasing = TRUE)
ThreeTokenTermFreqPerc = 100 * (ThreeTokenTermFreq / sum (ThreeTokenTermFreq))
ThreeTokenTermFreqTopTwenty = head (ThreeTokenTermFreqPerc, 20)

plot(qplot (names (OneTokenTermFreqTopTwenty), OneTokenTermFreqTopTwenty,
       main = "Most Frequent 1-Grams",
       geom = "bar", stat = "identity",
       xlab = "Word", ylab = "% of all terms") + theme (
         axis.text.x = element_text (angle = 45)))
[Figure: Most Frequent 1-Grams]
plot(qplot (names (TwoTokenTermFreqTopTwenty), TwoTokenTermFreqTopTwenty,
       main = "Most Frequent 2-Grams",
       geom = "bar", stat = "identity",
       xlab = "Phrase", ylab = "% of all terms") + theme (
         axis.text.x = element_text (angle = 45)))
[Figure: Most Frequent 2-Grams]
qplot (names (ThreeTokenTermFreqTopTwenty), ThreeTokenTermFreqTopTwenty,
       main = "Most Frequent 3-Grams",
       geom = "bar", stat = "identity",
       xlab = "Phrase", ylab = "% of all terms") + theme (
         axis.text.x = element_text (angle = 45))
[Figure: Most Frequent 3-Grams]