setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project")
library(NLP)
library(openNLP)
library(tm)
library(RWeka)
library(qdapDictionaries)
library(qdapRegex)
library(qdapTools)
library(RColorBrewer)
library(qdap)
library(stringr)
library(ggplot2)
library(SnowballC)
library(wordcloud)
fileName = "Coursera-SwiftKey.zip"
if(!file.exists(fileName)){
#Download the dataset
download.file(url="https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
destfile = fileName)
Download_Date <- Sys.time()
Download_Date
#Unzip the dataset
unzip(zipfile = fileName,overwrite = TRUE)
}else{
print("Dataset is already downloaded!")
}
## [1] "Dataset is already downloaded!"
# List the unzipped files
list.files("./final")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
list.files("./final/en_US/")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
list.files("./en_US/")
## [1] "en_US.blogs.txt" "en_US.news.txt"
## [3] "en_US.twitter.txt" "profanity.txt"
## [5] "SampleDataBlog1.txt" "SampleDataNews1.txt"
## [7] "SampleDataTwitter1.txt"
# Number of characters and number of lines in the blogs file
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.blogs.txt", "r")
blogs = readLines (connection, n = -1, encoding = "UTF-8")
close (connection)
nCharBlog = sum(nchar(blogs)); lenBlog = length(blogs)
nCharBlog;lenBlog
## [1] 206824505
## [1] 899288
# Number of characters and number of lines in the news file
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.news.txt", "r")
news = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): incomplete
## final line found on 'E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/
## en_US.news.txt'
close (connection)
nCharNews = sum(nchar(news)); lenNews = length(news)
nCharNews;lenNews
## [1] 15639408
## [1] 77259
# Number of characters and number of lines in the Twitter file
connection = file ("file:///E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US/en_US.twitter.txt", "r")
tweets = readLines (connection, n = -1, encoding = "UTF-8")
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(connection, n = -1, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
close (connection)
nCharTweets = sum(nchar(tweets)); lenTweets = length (tweets)
nCharTweets;lenTweets
## [1] 162096031
## [1] 2360148
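# The embedded-nul warnings above are harmless for these counts. If they need
# to be avoided, readLines() accepts skipNul = TRUE; a hedged optional re-read
# (the tweetsNoNul name is illustrative and is not used below):
tweetsNoNul = readLines("./en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
length(tweetsNoNul)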
# Number of words in the blogs file
wordblog=sum(sapply(strsplit(blogs, " "), length))
wordblog
## [1] 37334131
# Number of words in the news file
wordnews=sum(sapply(strsplit(news, " "), length))
wordnews
## [1] 2643969
# Number of words in the Twitter file
wordtweet=sum(sapply(strsplit(tweets, " "), length))
wordtweet
## [1] 30373543
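# The three sets of counts are easier to compare side by side; a small summary
# table built only from the values computed above.
fileSummary = data.frame(Source = c("Blogs", "News", "Twitter"),
Lines = c(lenBlog, lenNews, lenTweets),
Characters = c(nCharBlog, nCharNews, nCharTweets),
Words = c(wordblog, wordnews, wordtweet))
fileSummary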
#SAMPLING OF DATA
#Sampling Blog - SampleDataBlog1
setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US")
conblog <- file("./en_US.blogs.txt", "r")
sampleSize <- 8000 # number of lines to keep from each source
SampleDataBlog1 <- readLines(conblog, n = sampleSize, encoding = "UTF-8")
writeLines(SampleDataBlog1, con = "SampleDataBlog1.txt", sep = "\n")
close(conblog)
#Sampling News - SampleDataNews1
setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US")
connews <- file("./en_US.news.txt", "r")
sampleSize <- 8000
SampleDataNews1 <- readLines(connews, n = sampleSize, encoding = "UTF-8")
writeLines(SampleDataNews1, con = "SampleDataNews1.txt", sep = "\n")
close(connews)
#Sampling Twitter - SampleDataTwitter1
setwd("E:/LIBA_EDBA/PL/EDBA- PL-1st Sem-Project/en_US")
contwitter <- file("./en_US.twitter.txt", "r")
sampleSize <- 8000
SampleDataTwitter1 <- readLines(contwitter, n = sampleSize, encoding = "UTF-8")
writeLines(SampleDataTwitter1, con = "SampleDataTwitter1.txt", sep = "\n")
close(contwitter)
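# The samples above simply keep the first sampleSize lines of each file. If a
# more representative draw is wanted, random sampling from the vectors already
# in memory is an option; a hedged alternative sketch (not the samples written
# to disk above, and the *Random names are illustrative only).
set.seed(1234) # make the random draw reproducible
SampleBlogRandom = sample(blogs, sampleSize)
SampleNewsRandom = sample(news, sampleSize)
SampleTwitterRandom = sample(tweets, sampleSize)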
#Data processing - Tokenization and Profanity Filtering
# Remove retweets from the Twitter sample
SampleDataTwitter1 = gsub ("(RT|via)((?:\\b\\W*@\\w+)+)", "", SampleDataTwitter1)
# Remove @people
SampleDataTwitter1 = gsub ("@\\w+", "", SampleDataTwitter1)
Sampledata =c(SampleDataBlog1, SampleDataNews1, SampleDataTwitter1)
Sampledata <- sent_detect(Sampledata, language = "en", model = NULL) # splitting of text paragraphs into sentences.
corpusdata <- VCorpus(VectorSource(Sampledata)) # main corpus with all sample files
removeURL = function(x) gsub("http\\w+", "", x)
corpusdata <- tm_map(corpusdata, content_transformer(removeURL)) # strip URLs
corpusdata <- tm_map(corpusdata, removeNumbers)
corpusdata <- tm_map(corpusdata, stripWhitespace)
corpusdata <- tm_map(corpusdata, content_transformer(tolower)) # content_transformer keeps documents as PlainTextDocument
corpusdata <- tm_map(corpusdata, removePunctuation)
corpusdata <- tm_map(corpusdata, removeWords, stopwords("english"))
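# Spot-check a few cleaned documents before profanity filtering (a minimal
# inspection sketch using tm accessors).
lapply(corpusdata[1:3], as.character)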
# Remove profanity
profanityFileName = "profanity.txt"
if (!file.exists(profanityFileName)) download.file(url = "http://pattern-for-python.googlecode.com/svn-history/r20/trunk/pattern/vector/wordlists/profanity.txt",
destfile = profanityFileName)
profanityWords = str_trim(as.character(read.table(profanityFileName, sep = ",",
stringsAsFactors = FALSE)))
## Warning in read.table(profanityFileName, sep = ",", stringsAsFactors =
## FALSE): incomplete final line found by readTableHeader on 'profanity.txt'
corpusdata = tm_map(corpusdata, removeWords, profanityWords)
# Trim leading and trailing white space
corpusdata = tm_map(corpusdata, content_transformer(Trim))
# Finally, collapse the extra white space created by the removals
corpusdata = tm_map(corpusdata, stripWhitespace)
# Expand common abbreviations (e.g., "Dr." to "Doctor") remaining in the text
corpusdata = tm_map(corpusdata, content_transformer(replace_abbreviation))
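# What replace_abbreviation does on a toy string (illustrative only; with the
# default qdap dictionary "Dr." should become "Doctor").
replace_abbreviation("Dr. Smith is here.")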
# The corpus is tokenized into 1-, 2- and 3-grams, and term-document matrices
# are built to examine the frequency of words and phrases; this informs the later modeling.
# RWeka's NGramTokenizer provides the 1-gram (single word), 2-gram and 3-gram
# tokenizers used for the exploratory analysis below.
OneGramTokenizer = function (x) {
NGramTokenizer (x, Weka_control (min = 1, max = 1))
}
TwoGramTokenizer = function (x) {
NGramTokenizer (x, Weka_control (min = 2, max = 2))
}
ThreeGramTokenizer = function (x) {
NGramTokenizer (x, Weka_control (min = 3, max = 3))
}
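# A quick look at what the bigram tokenizer returns on a short string (the
# sentence is illustrative only).
TwoGramTokenizer("this is a short test sentence")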
tdmOneToken = TermDocumentMatrix (corpusdata,
control = list (tokenize = OneGramTokenizer))
tdmOneToken <- removeSparseTerms(tdmOneToken, 0.99)
tdmTwoToken = TermDocumentMatrix (corpusdata,
control = list (tokenize = TwoGramTokenizer))
tdmTwoToken <- removeSparseTerms(tdmTwoToken, 0.99)
tdmThreeToken = TermDocumentMatrix (corpusdata,
control = list (tokenize = ThreeGramTokenizer))
tdmThreeToken <- removeSparseTerms(tdmThreeToken, 0.99)
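# Quick structural check: how many terms survived the sparsity pruning in each matrix.
dim(tdmOneToken); dim(tdmTwoToken); dim(tdmThreeToken)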
OneTokenTermFreq = sort (rowSums (as.matrix (tdmOneToken)), decreasing = TRUE)
OneTokenTermFreqPerc = 100 * (OneTokenTermFreq / sum (OneTokenTermFreq))
OneTokenTermFreqTopTwenty = head (OneTokenTermFreqPerc, 20)
TwoTokenTermFreq = sort (rowSums (as.matrix (tdmTwoToken)), decreasing = TRUE)
TwoTokenTermFreqPerc = 100 * (TwoTokenTermFreq / sum (TwoTokenTermFreq))
TwoTokenTermFreqTopTwenty = head (TwoTokenTermFreqPerc, 20)
ThreeTokenTermFreq = sort (rowSums (as.matrix (tdmThreeToken)), decreasing = TRUE)
ThreeTokenTermFreqPerc = 100 * (ThreeTokenTermFreq / sum (ThreeTokenTermFreq))
ThreeTokenTermFreqTopTwenty = head (ThreeTokenTermFreqPerc, 20)
p = qplot (names (OneTokenTermFreqTopTwenty), OneTokenTermFreqTopTwenty,
main = "Most Frequent 1-Grams",
geom = "col",
xlab = "Word", ylab = "% of all terms") + theme (
axis.text.x = element_text (angle = 45, hjust = 1))
print(p)
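# The wordcloud package loaded at the top can give an alternative view of the
# same unigram frequencies; a minimal sketch using the counts computed above.
set.seed(2019) # reproducible cloud layout
wordcloud(words = names(OneTokenTermFreq), freq = OneTokenTermFreq,
max.words = 100, colors = brewer.pal(8, "Dark2"))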
