Team Members

  1. Jenifer PK
  2. Srinath Subramani KS

ABOUT DATASET:

The project that we have selected for this course is "TEXT PREDICTION - EXPLORATORY ANALYSIS ON SWIFTKEY DATASETS". The HC Corpora dataset is compiled from a large number of news sites, blogs and Twitter feeds. It contains three files in each of four languages (Russian, Finnish, German and English); this project focuses on the English-language files. The names of the data files are as follows:

  1. en_US.blogs.txt
  2. en_US.twitter.txt
  3. en_US.news.txt

The datasets will be referred to as "Blogs", "Twitter" and "News" for the remainder of this report.

  Source    Words         Characters    Letters       Lines      Avg Word Length  Avg Words/Line
  Blogs     37,242,000    206,824,000   163,815,000   899,000    4.40             41.41
  News      34,275,000    203,223,000   162,803,000   1,010,000  4.75             33.93
  Twitter   29,876,000    162,122,000   125,998,000   2,360,000  4.22             12.66
  Total     101,393,000   572,170,000   452,617,000   4,269,000  4.46             23.75
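As a minimal sketch of how these per-file statistics could be reproduced in R (assuming blogs, news and tweets are character vectors holding one line of text per element, as read later in this report):

# Sketch: summary statistics for one corpus held as a character vector of lines
summariseCorpus = function (lines) {
  words   = sum (sapply (strsplit (lines, "\\s+"), length))   # total word count
  chars   = sum (nchar (lines))                               # total characters
  letters = sum (nchar (gsub ("[^A-Za-z]", "", lines)))       # alphabetic characters only
  c (Words = words,
     Characters = chars,
     Letters = letters,
     Lines = length (lines),
     AvgWordLength = round (letters / words, 2),
     AvgWordsPerLine = round (words / length (lines), 2))
}

# Example usage (after the three files have been read):
# rbind (Blogs = summariseCorpus (blogs),
#        News = summariseCorpus (news),
#        Twitter = summariseCorpus (tweets))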

TASK:

  1. Explore the data
  2. Profanity filtering - removing profanity and other words you do not want to predict.
  3. Tokenization - identifying appropriate tokens such as words, punctuation, and numbers.
  4. Train a Natural Language Processing model on the data
  5. Build a Shiny app with the model (see the sketch after the paragraphs below)

The task is to build a predictive model that suggests the next word as a user types a word or phrase, similar to how Google predicts what you want to search for based on the most popular search terms.
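As an illustration only (not the project's final model), a next-word lookup can be sketched on top of the 2-gram and 3-gram frequency tables built later in this report (TwoTokenTermFreq and ThreeTokenTermFreq); the helper name predictNextWord and the simple frequency back-off are assumptions made for the sketch.

# Illustrative sketch: pick the most frequent n-gram continuation of the typed phrase
predictNextWord = function (phrase, bigramFreq, trigramFreq) {
  words = unlist (strsplit (tolower (phrase), "\\s+"))
  n = length (words)
  # Try trigram continuations of the last two words first
  if (n >= 2) {
    prefix = paste (words[n - 1], words[n])
    hits = trigramFreq[grepl (paste0 ("^", prefix, " "), names (trigramFreq))]
    if (length (hits) > 0)
      return (sub (paste0 ("^", prefix, " "), "", names (hits)[1]))
  }
  # Back off to bigram continuations of the last word
  if (n >= 1) {
    prefix = words[n]
    hits = bigramFreq[grepl (paste0 ("^", prefix, " "), names (bigramFreq))]
    if (length (hits) > 0)
      return (sub (paste0 ("^", prefix, " "), "", names (hits)[1]))
  }
  NA_character_
}

# Example usage (after the frequency tables have been computed):
# predictNextWord ("thanks for the", TwoTokenTermFreq, ThreeTokenTermFreq)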

From the SwiftKey Twitter, News and Blogs files in English we create a data product that predicts the next word. The blogs and news datasets are each approximately 200 megabytes, and the Twitter dataset is approximately 160 megabytes.

In this project we apply Natural Language Processing (NLP), text mining, and the tools available in R, both for exploratory data analysis and for the subsequent text modelling and prediction.
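For task 5, the Shiny front end could be as small as the sketch below; it assumes the hypothetical predictNextWord helper sketched above and is not the final application.

# Minimal Shiny sketch (assumption: predictNextWord and the frequency tables exist)
library (shiny)

ui = fluidPage (
  textInput ("phrase", "Type a phrase:"),
  textOutput ("nextWord")
)

server = function (input, output) {
  output$nextWord = renderText ({
    predictNextWord (input$phrase, TwoTokenTermFreq, ThreeTokenTermFreq)
  })
}

shinyApp (ui = ui, server = server)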

DATASET DOWNLOAD:

We downloaded the dataset for this project from the following web site and unzipped it into our working directory:

http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.

The dataset is originally from HC Corpora.

#To list Files
list.files("./en_US/")
## character(0)
#No of Characters in Blog and No of Lines in Blog

connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = -1, encoding = "UTF-8")
close (connection)
nCharBlog = sum(nchar(blogs)); lenBlog = length(blogs)
nCharBlog;lenBlog
## [1] 206824505
## [1] 899288
#No of Characters in News and No of Lines in News
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.news.txt", "r")
news = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on 'E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/
## en_US.news.txt'
close (connection)
nCharNews = sum(nchar(news)); lenNews = length(news)
nCharNews;lenNews
## [1] 15639408
## [1] 77259
#No of Characters in Twitter and No of Lines in Twitter
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close (connection)
nCharTweets = sum(nchar(tweets)); lenTweets = length (tweets)
nCharTweets;lenTweets
## [1] 162096031
## [1] 2360148
#No of words in blogs
wordblog=sum(sapply(strsplit(blogs, " "), length))
wordblog
## [1] 37334131
#No of words in news
wordnews=sum(sapply(strsplit(news, " "), length))
wordnews
## [1] 2643969
#No of words in tweets
wordtweet=sum(sapply(strsplit(tweets, " "), length))
wordtweet
## [1] 30373543
setwd("E:/Acadamics/Project")
library(NLP)
library(openNLP)
library(tm)
library(RWeka)
library(qdapDictionaries)
## Warning: package 'qdapDictionaries' was built under R version 3.2.2
library(qdapRegex)
## Warning: package 'qdapRegex' was built under R version 3.2.2
library(qdapTools)
## Warning: package 'qdapTools' was built under R version 3.2.2
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 3.2.2
library(qdap)
## Warning: package 'qdap' was built under R version 3.2.2
## 
## Attaching package: 'qdap'
## 
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## 
## The following object is masked from 'package:NLP':
## 
##     ngrams
## 
## The following object is masked from 'package:base':
## 
##     Filter
library(stringr)
## 
## Attaching package: 'stringr'
## 
## The following object is masked from 'package:qdap':
## 
##     %>%
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(RColorBrewer)
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.2.2
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.2
fileName = "Coursera-SwiftKey.zip"
if (!file.exists (fileName))
  download.file (url = "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", destfile = fileName)

unzip (zipfile = fileName, overwrite = TRUE)
list.files ("./final/")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
list.files ("./final/en_US/")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"
linesToRead = 100

connection = file ("./final/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = -1, encoding = "UTF-8")
close (connection)
nCharBlog = nchar (blogs); lenBlog = length (blogs)

connection = file ("./final/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = linesToRead, encoding = "UTF-8")
close (connection)

connection = file ("./final/en_US/en_US.news.txt", "r")
news = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on './final/en_US/en_US.news.txt'
close (connection)
nCharNews = nchar (news); lenNews = length (news)

connection = file ("./final/en_US/en_US.news.txt", "r")
news = readLines (connection, n = linesToRead, encoding = "UTF-8")
close (connection)

connection = file ("./final/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close (connection)
nCharTweets = nchar (tweets); lenTweets = length (tweets)

connection = file ("./final/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = linesToRead, encoding = "UTF-8")
close (connection)

# Remove retweets
tweets = gsub ("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets)
# Remove @people
tweets = gsub ("@\\w+", "", tweets)

textData = c (blogs, news, tweets)

#Replace abbreviations so that the sentences are not split at incorrect places.
textData = replace_abbreviation (textData)

# convert paragraphs to sentences
myEndmarks = c("?", ".", "!", "|", ":", "\n", "\r\n")
textData = sent_detect (textData, endmarks = myEndmarks, rm.bracket = FALSE)

textCorpus = VCorpus (VectorSource (textData))

# Remove URL
removeURL = function(x) gsub("http\\w+", "", x)
textCorpus = tm_map(textCorpus, content_transformer(removeURL))

# Trim leading and trailing white space
textCorpus = tm_map(textCorpus, content_transformer(trimws))

# Only after the above, remove punctuation and numbers and convert to lower case
textCorpus = tm_map(textCorpus, removePunctuation)
textCorpus = tm_map(textCorpus, removeNumbers)
textCorpus = tm_map(textCorpus, content_transformer(tolower))

# Remove profanity
profanityFileName = "profanity.txt"
if (!file.exists(profanityFileName)) download.file(url = "http://pattern-for-python.googlecode.com/svn-history/r20/trunk/pattern/vector/wordlists/profanity.txt",
                                                   destfile = profanityFileName)
profanityWords = str_trim(as.character(read.table(profanityFileName, sep = ",",
                                                  stringsAsFactors = FALSE)))
## Warning in read.table(profanityFileName, sep = ",", stringsAsFactors =
## FALSE): incomplete final line found by readTableHeader on 'profanity.txt'
textCorpus = tm_map(textCorpus, removeWords, profanityWords)

# Finally remove all the white space that was created by the removals
textCorpus = tm_map(textCorpus, stripWhitespace)


OneGramTokenizer = function (corpus) {
  NGramTokenizer (corpus, Weka_control (min = 1, max = 1))
}

TwoGramTokenizer = function (corpus) {
  NGramTokenizer (corpus, Weka_control (min = 2, max = 2))
}

ThreeGramTokenizer = function (corpus) {
  NGramTokenizer (corpus, Weka_control (min = 3, max = 3))
}

tdmOneToken = TermDocumentMatrix (textCorpus,
                                  control = list (tokenize = OneGramTokenizer))

tdmTwoToken = TermDocumentMatrix (textCorpus,
                                  control = list (tokenize = TwoGramTokenizer))

tdmThreeToken = TermDocumentMatrix (textCorpus,
                                    control = list (tokenize = ThreeGramTokenizer))


OneTokenTermFreq = sort (rowSums (as.matrix (tdmOneToken)), decreasing = TRUE)
OneTokenTermFreqPerc = 100 * (OneTokenTermFreq / sum (OneTokenTermFreq))
OneTokenTermFreqTopTwenty = head (OneTokenTermFreqPerc, 20)

TwoTokenTermFreq = sort (rowSums (as.matrix (tdmTwoToken)), decreasing = TRUE)
TwoTokenTermFreqPerc = 100 * (TwoTokenTermFreq / sum (TwoTokenTermFreq))
TwoTokenTermFreqTopTwenty = head (TwoTokenTermFreqPerc, 20)

ThreeTokenTermFreq = sort (rowSums (as.matrix (tdmThreeToken)), decreasing = TRUE)
ThreeTokenTermFreqPerc = 100 * (ThreeTokenTermFreq / sum (ThreeTokenTermFreq))
ThreeTokenTermFreqTopTwenty = head (ThreeTokenTermFreqPerc, 20)

plot(qplot (names (OneTokenTermFreqTopTwenty), OneTokenTermFreqTopTwenty,
       main = "Most Frequent 1-Grams",
       geom = "bar", stat = "identity",
       xlab = "Word", ylab = "% of all terms") + theme (
         axis.text.x = element_text (angle = 45)))
[Figure: Most Frequent 1-Grams]
plot(qplot (names (TwoTokenTermFreqTopTwenty), TwoTokenTermFreqTopTwenty,
       main = "Most Frequent 2-Grams",
       geom = "bar", stat = "identity",
       xlab = "Phrase", ylab = "% of all terms") + theme (
         axis.text.x = element_text (angle = 45)))
[Figure: Most Frequent 2-Grams]
qplot (names (ThreeTokenTermFreqTopTwenty), ThreeTokenTermFreqTopTwenty,
       main = "Most Frequent 3-Grams",
       geom = "bar", stat = "identity",
       xlab = "Phrase", ylab = "% of all terms") + theme (
         axis.text.x = element_text (angle = 45))
[Figure: Most Frequent 3-Grams]