The project that we have selected for this course is "TEXT PREDICTION - EXPLORATORY ANALYSIS ON SWIFTKEY DATASETS". The HC Corpora dataset comprises text collected from numerous news sites, blogs and Twitter. The dataset contains three files for each of four languages (Russian, Finnish, German and English); this project focuses on the English-language files: en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.
These datasets will be referred to as "Blogs", "News" and "Twitter" for the remainder of this report.
| Dataset | Words | Characters | Letters | Lines | Avg Word Length | Avg Words/Line |
|---------|-------|------------|---------|-------|-----------------|----------------|
| Blogs | 37,242,000 | 206,824,000 | 163,815,000 | 899,000 | 4.4 | 41.41 |
| News | 34,275,000 | 203,223,000 | 162,803,000 | 1,010,000 | 4.75 | 33.93 |
| Twitter | 29,876,000 | 162,122,000 | 125,998,000 | 2,360,000 | 4.22 | 12.66 |
| Total | 101,393,000 | 572,170,000 | 452,617,000 | 4,269,000 | 4.46 | 23.75 |
The task is to build a predictive model that suggests the next word as a user types a word or phrase, much as Google suggests what you want to search for based on the most popular search terms. Using the English-language SwiftKey files (Blogs, News and Twitter), we will create a data product to predict the next word. The blogs and news datasets are each approximately 200 megabytes, and the Twitter dataset is approximately 160 megabytes.
In this project we apply Natural Language Processing (NLP), text mining, and R tooling, first for exploratory data analysis and then for text modelling and prediction.
We downloaded the dataset from http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and unzipped it into our working directory. The dataset is originally from HC Corpora.
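As a quick check on the sizes quoted above, file.info() reports each file's size on disk. A minimal sketch, assuming the ./final/en_US/ layout produced by unzipping the archive (listed later in this report):

# File sizes in megabytes (paths assume the unzipped ./final/en_US/ layout)
round(file.info("./final/en_US/en_US.blogs.txt")$size / 1024^2)
round(file.info("./final/en_US/en_US.news.txt")$size / 1024^2)
round(file.info("./final/en_US/en_US.twitter.txt")$size / 1024^2)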
# To list files
list.files("./en_US/")
## character(0)
# No of characters and no of lines in Blogs
connection = file("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.blogs.txt", "r")
blogs = readLines(connection, n = -1, encoding = "UTF-8")
close(connection)
nCharBlog = sum(nchar(blogs)); lenBlog = length(blogs)
nCharBlog; lenBlog
## [1] 206824505
## [1] 899288
# No of characters and no of lines in News
connection = file("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.news.txt", "r")
news = readLines(connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on 'E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/
## en_US.news.txt'
close(connection)
nCharNews = sum(nchar(news)); lenNews = length(news)
nCharNews; lenNews
## [1] 15639408
## [1] 77259
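Note that these counts (about 15.6 million characters over 77,259 lines) fall far short of the roughly 203 million characters and one million lines in the summary table: the warning above suggests the text-mode read stopped early. A minimal workaround sketch, assuming the same file path: open the connection in binary mode so readLines() reads past embedded control characters, and skip nul bytes explicitly.

# Hedged workaround: binary mode ("rb") avoids the early stop at an embedded
# control character; skipNul = TRUE drops embedded nuls instead of warning
connection = file("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.news.txt", "rb")
news = readLines(connection, n = -1, encoding = "UTF-8", skipNul = TRUE)
close(connection)
sum(nchar(news)); length(news)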
# No of characters and no of lines in Twitter
connection = file("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.twitter.txt", "r")
tweets = readLines(connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close(connection)
nCharTweets = sum(nchar(tweets)); lenTweets = length(tweets)
nCharTweets; lenTweets
## [1] 162096031
## [1] 2360148
# No of words in Blogs
wordblog = sum(sapply(strsplit(blogs, " "), length))
wordblog
## [1] 37334131
# No of words in News
wordnews = sum(sapply(strsplit(news, " "), length))
wordnews
## [1] 2643969
# No of words in Twitter
wordtweet = sum(sapply(strsplit(tweets, " "), length))
wordtweet
## [1] 30373543
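Splitting on a single space over-counts words wherever a line contains consecutive spaces, because strsplit() then yields empty strings. A minimal alternative sketch, splitting on runs of whitespace instead:

# Count words by splitting on runs of whitespace rather than a single space
countWords = function(lines) sum(sapply(strsplit(lines, "\\s+"), length))
countWords(blogs); countWords(news); countWords(tweets)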
setwd("E:/Acadamics/Project")
library(NLP)
library(openNLP)
library(tm)
library(RWeka)
library(qdapDictionaries)
## Warning: package 'qdapDictionaries' was built under R version 3.2.2
library(qdapRegex)
## Warning: package 'qdapRegex' was built under R version 3.2.2
library(qdapTools)
## Warning: package 'qdapTools' was built under R version 3.2.2
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 3.2.2
library(qdap)
## Warning: package 'qdap' was built under R version 3.2.2
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.2.2
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.2
fileName = "Coursera-SwiftKey.zip"
if (!file.exists(fileName))
    download.file(url = "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", destfile = fileName)
unzip(zipfile = fileName, overwrite = TRUE)
list.files("./final/")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
list.files ("./final/en_US/")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
# Read each file in full for the character/line counts, then re-read only the
# first 100 lines to keep the exploratory corpus small
linesToRead = 100
connection = file("./final/en_US/en_US.blogs.txt", "r")
blogs = readLines(connection, n = -1, encoding = "UTF-8")
close(connection)
nCharBlog = nchar(blogs); lenBlog = length(blogs)
connection = file("./final/en_US/en_US.blogs.txt", "r")
blogs = readLines(connection, n = linesToRead, encoding = "UTF-8")
close(connection)
connection = file("./final/en_US/en_US.news.txt", "r")
news = readLines(connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on './final/en_US/en_US.news.txt'
close(connection)
nCharNews = nchar(news); lenNews = length(news)
connection = file("./final/en_US/en_US.news.txt", "r")
news = readLines(connection, n = linesToRead, encoding = "UTF-8")
close(connection)
connection = file("./final/en_US/en_US.twitter.txt", "r")
tweets = readLines(connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close(connection)
nCharTweets = nchar(tweets); lenTweets = length(tweets)
connection = file("./final/en_US/en_US.twitter.txt", "r")
tweets = readLines(connection, n = linesToRead, encoding = "UTF-8")
close(connection)
# Remove retweets
tweets = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets)
# Remove @people
tweets = gsub("@\\w+", "", tweets)
textData = c(blogs, news, tweets)
# Replace abbreviations so that the sentences are not split at incorrect places
textData = replace_abbreviation(textData)
# Convert paragraphs to sentences
myEndmarks = c("?", ".", "!", "|", ":", "\n", "\r\n")
textData = sent_detect(textData, endmarks = myEndmarks, rm.bracket = FALSE)
textCorpus = VCorpus(VectorSource(textData))
# Remove URLs
removeURL = function(x) gsub("http\\w+", "", x)
textCorpus = tm_map(textCorpus, content_transformer(removeURL))
# Trim leading and ending white spaces
textCorpus = tm_map(textCorpus, trim)
## Error in match.fun(FUN): object 'trim' not found
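The error arises because no function named trim is defined or loaded at this point (qdap provides a similar helper named Trim(), with a capital T). A minimal fix sketch: define trim explicitly and apply it through content_transformer() so the corpus structure is preserved.

# Hypothetical fix: define trim ourselves and wrap it in content_transformer()
trim = function(x) gsub("^\\s+|\\s+$", "", x)
textCorpus = tm_map(textCorpus, content_transformer(trim))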
# Only after the above, remove punctuation and numbers and convert to lower case
textCorpus = tm_map(textCorpus, removePunctuation)
textCorpus = tm_map(textCorpus, removeNumbers)
textCorpus = tm_map(textCorpus, content_transformer(tolower))
# Remove profanity
profanityFileName = "profanity.txt"
if (!file.exists(profanityFileName))
    download.file(url = "http://pattern-for-python.googlecode.com/svn-history/r20/trunk/pattern/vector/wordlists/profanity.txt", destfile = profanityFileName)
profanityWords = str_trim(as.character(read.table(profanityFileName, sep = ",", stringsAsFactors = FALSE)))
## Warning in read.table(profanityFileName, sep = ",", stringsAsFactors =
## FALSE): incomplete final line found by readTableHeader on 'profanity.txt'
textCorpus = tm_map(textCorpus, removeWords, profanityWords)
# Finally remove all the white space that was created by the removals
textCorpus = tm_map(textCorpus, stripWhitespace)
# Tokenizers for 1-, 2- and 3-grams
OneGramTokenizer = function(corpus) { NGramTokenizer(corpus, Weka_control(min = 1, max = 1)) }
TwoGramTokenizer = function(corpus) { NGramTokenizer(corpus, Weka_control(min = 2, max = 2)) }
ThreeGramTokenizer = function(corpus) { NGramTokenizer(corpus, Weka_control(min = 3, max = 3)) }
# Term-document matrix for each n-gram size
tdmOneToken = TermDocumentMatrix(textCorpus, control = list(tokenize = OneGramTokenizer))
tdmTwoToken = TermDocumentMatrix(textCorpus, control = list(tokenize = TwoGramTokenizer))
tdmThreeToken = TermDocumentMatrix(textCorpus, control = list(tokenize = ThreeGramTokenizer))
# Term frequencies as percentages, keeping the top twenty of each
OneTokenTermFreq = sort(rowSums(as.matrix(tdmOneToken)), decreasing = TRUE)
OneTokenTermFreqPerc = 100 * (OneTokenTermFreq / sum(OneTokenTermFreq))
OneTokenTermFreqTopTwenty = head(OneTokenTermFreqPerc, 20)
TwoTokenTermFreq = sort(rowSums(as.matrix(tdmTwoToken)), decreasing = TRUE)
TwoTokenTermFreqPerc = 100 * (TwoTokenTermFreq / sum(TwoTokenTermFreq))
TwoTokenTermFreqTopTwenty = head(TwoTokenTermFreqPerc, 20)
ThreeTokenTermFreq = sort(rowSums(as.matrix(tdmThreeToken)), decreasing = TRUE)
ThreeTokenTermFreqPerc = 100 * (ThreeTokenTermFreq / sum(ThreeTokenTermFreq))
ThreeTokenTermFreqTopTwenty = head(ThreeTokenTermFreqPerc, 20)
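The wordcloud package loaded earlier is not otherwise used in this excerpt; as a minimal sketch, the 1-gram frequencies computed above could feed it directly:

# Hypothetical word cloud of the 100 most frequent 1-grams, sized by frequency
wordcloud(names(OneTokenTermFreq), OneTokenTermFreq, max.words = 100,
          colors = brewer.pal(8, "Dark2"))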

plot(qplot(names(OneTokenTermFreqTopTwenty), OneTokenTermFreqTopTwenty,
    main = "Most Frequent 1-Grams", geom = "bar", stat = "identity",
    xlab = "Word", ylab = "% of all terms") +
    theme(axis.text.x = element_text(angle = 45)))

plot(qplot(names(TwoTokenTermFreqTopTwenty), TwoTokenTermFreqTopTwenty,
    main = "Most Frequent 2-Grams", geom = "bar", stat = "identity",
    xlab = "Phrase", ylab = "% of all terms") +
    theme(axis.text.x = element_text(angle = 45)))

qplot(names(ThreeTokenTermFreqTopTwenty), ThreeTokenTermFreqTopTwenty,
    main = "Most Frequent 3-Grams", geom = "bar", stat = "identity",
    xlab = "Phrase", ylab = "% of all terms") +
    theme(axis.text.x = element_text(angle = 45))

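These n-gram frequency tables are the raw material for the next-word prediction task described at the start of this report. As a minimal sketch under stated assumptions (predictNextWord() is a hypothetical helper, not part of the analysis above), a simple backoff lookup can scan the frequency-sorted 3-gram table for entries beginning with the user's last two words, fall back to the 2-gram table, and finally to the most common single word:

# Hypothetical backoff lookup over the frequency-sorted tables built above
predictNextWord = function(phrase) {
    words = tolower(strsplit(str_trim(phrase), "\\s+")[[1]])
    n = length(words)
    if (n >= 2) {
        # Try the 3-gram table: entries look like "word1 word2 word3"
        prefix = paste0("^", words[n - 1], " ", words[n], " ")
        hits = grep(prefix, names(ThreeTokenTermFreq), value = TRUE)
        if (length(hits) > 0) return(sub(prefix, "", hits[1]))
    }
    # Back off to the 2-gram table
    prefix = paste0("^", words[n], " ")
    hits = grep(prefix, names(TwoTokenTermFreq), value = TRUE)
    if (length(hits) > 0) return(sub(prefix, "", hits[1]))
    # Last resort: the most frequent single word
    names(OneTokenTermFreq)[1]
}
predictNextWord("thanks for the")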