setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project")
library(NLP)
library(openNLP)
library(tm)
library(RWeka)
library(qdapDictionaries)
library(qdapRegex)
library(qdapTools)
library(RColorBrewer)
library(qdap)
library(stringr)
library(ggplot2)
library(SnowballC)
library(wordcloud)
fileName = "Coursera-SwiftKey.zip"
if(!file.exists(fileName)){
#Download the dataset
download.file(url="https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
destfile = fileName)
Download_Date <- Sys.time()
Download_Date
#Unzip the dataset
unzip(zipfile = fileName,overwrite = TRUE)
}else{
print("Dataset is already downloaded!")
}
## [1] "Dataset is already downloaded!"
# List the unzipped files
list.files("./final")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
list.files("./final/en_US/")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
list.files("./en_US/")
## [1] "en_US.blogs.txt" "en_US.news.txt"
## [3] "en_US.twitter.txt" "profanity.txt"
## [5] "SampleDataBlog1.txt" "SampleDataNews1.txt"
## [7] "SampleDataTwitter1.txt"
# Number of characters and number of lines in the blogs file
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = -1, encoding = "UTF-8")
close (connection)
nCharBlog = sum(nchar(blogs)); lenBlog = length(blogs)
nCharBlog;lenBlog
## [1] 206824505
## [1] 899288
# Number of characters and number of lines in the news file
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.news.txt", "r")
news = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on 'E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/
## en_US.news.txt'
close (connection)
nCharNews = sum(nchar(news)); lenNews = length(news)
nCharNews;lenNews
## [1] 15639408
## [1] 77259
# Number of characters and number of lines in the Twitter file
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close (connection)
nCharTweets = sum(nchar(tweets)); lenTweets = length (tweets)
nCharTweets;lenTweets
## [1] 162096031
## [1] 2360148
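# The embedded-nul warnings above are harmless for these counts. If they need
# to be avoided, readLines() accepts skipNul = TRUE; a hedged optional re-read
# (the tweetsNoNul name is illustrative and is not used below):
tweetsNoNul = readLines("./en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
length(tweetsNoNul)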
# Number of words in the blogs file
wordblog=sum(sapply(strsplit(blogs, " "), length))
wordblog
## [1] 37334131
# Number of words in the news file
wordnews=sum(sapply(strsplit(news, " "), length))
wordnews
## [1] 2643969
# Number of words in the Twitter file
wordtweet=sum(sapply(strsplit(tweets, " "), length))
wordtweet
## [1] 30373543
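# The three sets of counts are easier to compare side by side; a small summary
# table built only from the values computed above.
fileSummary = data.frame(Source = c("Blogs", "News", "Twitter"),
Lines = c(lenBlog, lenNews, lenTweets),
Characters = c(nCharBlog, nCharNews, nCharTweets),
Words = c(wordblog, wordnews, wordtweet))
fileSummary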
#SAMPLING OF DATA
#Sampling Blog - SampleDataBlog1
setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US")
conblog <- file("./en_US.blogs.txt", "r")
sampleSize <- 8000 # number of lines to keep from each source
SampleDataBlog1 <- readLines(conblog, n = sampleSize, encoding = "UTF-8")
writeLines(SampleDataBlog1, con = "SampleDataBlog1.txt", sep = "\n")
close(conblog)
#Sampling News - SampleDataNews1
setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US")
connews <- file("./en_US.news.txt", "r")
sampleSize <- 8000
SampleDataNews1 <- readLines(connews, n = sampleSize, encoding = "UTF-8")
writeLines(SampleDataNews1, con = "SampleDataNews1.txt", sep = "\n")
close(connews)
#Sampling Twitter - SampleDataTwitter1
setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US")
contwitter <- file("./en_US.twitter.txt", "r")
sampleSize <- 8000
SampleDataTwitter1 <- readLines(contwitter, n = sampleSize, encoding = "UTF-8")
writeLines(SampleDataTwitter1, con = "SampleDataTwitter1.txt", sep = "\n")
close(contwitter)
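# The samples above simply keep the first sampleSize lines of each file. If a
# more representative draw is wanted, random sampling from the vectors already
# in memory is an option; a hedged alternative sketch (not the samples written
# to disk above, and the *Random names are illustrative only).
set.seed(1234) # make the random draw reproducible
SampleBlogRandom = sample(blogs, sampleSize)
SampleNewsRandom = sample(news, sampleSize)
SampleTwitterRandom = sample(tweets, sampleSize)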
#Data processing - Tokenization and Profanity Filtering
# Remove retweets from the Twitter sample
SampleDataTwitter1 = gsub ("(RT|via)((?:\\b\\W*@\\w+)+)", "", SampleDataTwitter1)
# Remove @people
SampleDataTwitter1 = gsub ("@\\w+", "", SampleDataTwitter1)
Sampledata =c(SampleDataBlog1, SampleDataNews1, SampleDataTwitter1)
Sampledata <- sent_detect(Sampledata, language = "en", model = NULL) # splitting of text paragraphs into sentences.
corpusdata <- VCorpus(VectorSource(Sampledata)) # main corpus with all sample files
removeURL = function(x) gsub("http\\w+", "", x)
corpusdata <- tm_map(corpusdata, content_transformer(removeURL)) # strip URLs
corpusdata <- tm_map(corpusdata, removeNumbers)
corpusdata <- tm_map(corpusdata, stripWhitespace)
corpusdata <- tm_map(corpusdata, content_transformer(tolower)) # content_transformer keeps documents as PlainTextDocument
corpusdata <- tm_map(corpusdata, removePunctuation)
corpusdata <- tm_map(corpusdata, removeWords, stopwords("english"))
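# Spot-check a few cleaned documents before profanity filtering (a minimal
# inspection sketch using tm accessors).
lapply(corpusdata[1:3], as.character)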
# Remove profanity
profanityFileName = "profanity.txt"
if (!file.exists(profanityFileName)) download.file(url = "http://pattern-for-python.googlecode.com/svn-history/r20/trunk/pattern/vector/wordlists/profanity.txt",
destfile = profanityFileName)
profanityWords = str_trim(as.character(read.table(profanityFileName, sep = ",",
stringsAsFactors = FALSE)))
## Warning in read.table(profanityFileName, sep = ",", stringsAsFactors =
## FALSE): incomplete final line found by readTableHeader on 'profanity.txt'
corpusdata = tm_map(corpusdata, removeWords, profanityWords)
# Trim leading and trailing white space
corpusdata = tm_map(corpusdata, content_transformer(Trim))
# Finally, collapse the extra white space created by the removals
corpusdata = tm_map(corpusdata, stripWhitespace)
# Expand common abbreviations (e.g., "Dr." to "Doctor") remaining in the text
corpusdata = tm_map(corpusdata, content_transformer(replace_abbreviation))
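# What replace_abbreviation does on a toy string (illustrative only; with the
# default qdap dictionary "Dr." should become "Doctor").
replace_abbreviation("Dr. Smith is here.")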
# The corpus is tokenized into 1-, 2- and 3-grams, and term-document matrices
# are built to examine the frequency of words and phrases; this informs the later modeling.
# RWeka's NGramTokenizer provides the 1-gram (single word), 2-gram and 3-gram
# tokenizers used for the exploratory analysis below.
OneGramTokenizer = function (x) {
NGramTokenizer (x, Weka_control (min = 1, max = 1))
}
TwoGramTokenizer = function (x) {
NGramTokenizer (x, Weka_control (min = 2, max = 2))
}
ThreeGramTokenizer = function (x) {
NGramTokenizer (x, Weka_control (min = 3, max = 3))
}
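# A quick look at what the bigram tokenizer returns on a short string (the
# sentence is illustrative only).
TwoGramTokenizer("this is a short test sentence")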
tdmOneToken = TermDocumentMatrix (corpusdata,
control = list (tokenize = OneGramTokenizer))
tdmOneToken <- removeSparseTerms(tdmOneToken, 0.99)
tdmTwoToken = TermDocumentMatrix (corpusdata,
control = list (tokenize = TwoGramTokenizer))
tdmTwoToken <- removeSparseTerms(tdmTwoToken, 0.99)
tdmThreeToken = TermDocumentMatrix (corpusdata,
control = list (tokenize = ThreeGramTokenizer))
tdmThreeToken <- removeSparseTerms(tdmThreeToken, 0.99)
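# Quick structural check: how many terms survived the sparsity pruning in each matrix.
dim(tdmOneToken); dim(tdmTwoToken); dim(tdmThreeToken)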
OneTokenTermFreq = sort (rowSums (as.matrix (tdmOneToken)), decreasing = TRUE)
OneTokenTermFreqPerc = 100 * (OneTokenTermFreq / sum (OneTokenTermFreq))
OneTokenTermFreqTopTwenty = head (OneTokenTermFreqPerc, 20)
TwoTokenTermFreq = sort (rowSums (as.matrix (tdmTwoToken)), decreasing = TRUE)
TwoTokenTermFreqPerc = 100 * (TwoTokenTermFreq / sum (TwoTokenTermFreq))
TwoTokenTermFreqTopTwenty = head (TwoTokenTermFreqPerc, 20)
ThreeTokenTermFreq = sort (rowSums (as.matrix (tdmThreeToken)), decreasing = TRUE)
ThreeTokenTermFreqPerc = 100 * (ThreeTokenTermFreq / sum (ThreeTokenTermFreq))
ThreeTokenTermFreqTopTwenty = head (ThreeTokenTermFreqPerc, 20)
p = qplot (names (OneTokenTermFreqTopTwenty), OneTokenTermFreqTopTwenty,
main = "Most Frequent 1-Grams",
geom = "col",
xlab = "Word", ylab = "% of all terms") + theme (
axis.text.x = element_text (angle = 45, hjust = 1))
print(p)
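# The wordcloud package loaded at the top can give an alternative view of the
# same unigram frequencies; a minimal sketch using the counts computed above.
set.seed(2019) # reproducible cloud layout
wordcloud(words = names(OneTokenTermFreq), freq = OneTokenTermFreq,
max.words = 100, colors = brewer.pal(8, "Dark2"))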
