This report is part of the final capstone project of the Coursera-Johns Hopkins University Data Science Specialization. In this report, we explore the English text data sets (blogs, news and twitter) and summarize the following:
Basic statistics of the data sets - line counts, word counts and a summary data table
Major features of the data sets, shown with histograms and other graphs
Plan for the prediction algorithm and Shiny app
The SwiftKey data zip file is downloaded from the URL (swifturl in the code below) provided in the Coursera DS Capstone Project Milestone Assignment.
## create a local data directory if it does not exist
if (!file.exists("./textData")) {
    dir.create("./textData")
}
swifturl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## if the file is not already downloaded, download it
if (!file.exists("/Users/fathims/Desktop/R Practice/textData/Coursera-SwiftKey.zip")){
    download.file(swifturl, destfile = "/Users/fathims/Desktop/R Practice/textData/Coursera-SwiftKey.zip", mode = "wb")
}
## unzip the file into the data directory if it has not been extracted yet
if (!file.exists("/Users/fathims/Desktop/R Practice/textData/final")) {
    unzip("/Users/fathims/Desktop/R Practice/textData/Coursera-SwiftKey.zip", exdir = "/Users/fathims/Desktop/R Practice/textData")
}
library(dplyr, warn.conflicts = FALSE)
# open connections to the three English files and read them line by line
con_tweets <- file("/Users/fathims/Desktop/R Practice/textData/final/en_US/en_US.twitter.txt", "r")
con_news   <- file("/Users/fathims/Desktop/R Practice/textData/final/en_US/en_US.news.txt", "r")
con_blogs  <- file("/Users/fathims/Desktop/R Practice/textData/final/en_US/en_US.blogs.txt", "r")
tweetdata <- readLines(con_tweets, encoding = "UTF-8", skipNul = TRUE); close(con_tweets)
blogdata  <- readLines(con_blogs, encoding = "UTF-8", skipNul = TRUE); close(con_blogs)
newsdata  <- readLines(con_news, encoding = "UTF-8", skipNul = TRUE); close(con_news)
library(stringi)
# number of lines in each file
lt <-length(tweetdata)
ln <-length(newsdata)
lb <-length(blogdata)
# number of words in each file
wt <- sum(stri_count(tweetdata,regex="\\S+"))
wn <- sum(stri_count(newsdata,regex="\\S+"))
wb <- sum(stri_count(blogdata,regex="\\S+"))
# number of characters in each file
cht <-sum(nchar(tweetdata, type = "chars"))
chn <- sum(nchar(newsdata, type = "chars"))
chb <- sum(nchar(blogdata, type = "chars"))
# Size of the files in MB
size_MB <- c(file.info("/Users/fathims/Desktop/R Practice/textData/final/en_US/en_US.twitter.txt")$size/1024^2,
             file.info("/Users/fathims/Desktop/R Practice/textData/final/en_US/en_US.news.txt")$size/1024^2,
             file.info("/Users/fathims/Desktop/R Practice/textData/final/en_US/en_US.blogs.txt")$size/1024^2)
## Data table summarizing the text data
TextTable <- data.frame(File = c("en_US.twitter", "en_US.news", "en_US.blog"),
                        Total_lines = c(lt, ln, lb),
                        Total_words = c(wt, wn, wb),
                        Total_characters = c(cht, chn, chb),
                        Average_no_words = round(c(wt/lt, wn/ln, wb/lb)),
                        Size = round(size_MB))
## remove the intermediate objects to free memory
rm(lt,ln,lb,wt,wn,wb,cht,chn,chb,size_MB)
library(tibble)
print(as_tibble(TextTable))
## # A tibble: 3 x 6
## File Total_lines Total_words Total_characters Average_no_words Size
## <fct> <int> <int> <int> <dbl> <dbl>
## 1 en_US.tw~ 2360148 30373583 162096241 13 159
## 2 en_US.ne~ 77259 2643969 15639408 34 196
## 3 en_US.bl~ 899288 37334131 206824505 42 200
There are more than 2 million lines in the twitter file and nearly 0.9 million lines in the blogs file.
The twitter and blogs files each contain more than 30 million words; the news file, as read here, contains about 2.6 million words.
The lines in the blogs file are the longest: the average number of words per line is 42 for blogs, compared with 13 for twitter.
The total size of the three files is more than 550 MB. The blogs file is the largest at about 200 MB, roughly a third of the combined size.
As it is computationally expensive to explore the data at the full size of the files, we take a sample of about 50,000 lines.
The number of lines drawn from each file is in inverse proportion to its average number of words per line: with averages of 13, 34 and 42 words (summing to 89), the weights are 76/89 for twitter, 55/89 for news and 47/89 for blogs (e.g. 76 = 89 - 13), so that each source contributes a comparable number of words to the sample. The rbinom function with 100,000 trials is first used to draw a randomised set of line indices from each file.
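As a quick sanity check, the per-source line counts implied by these weights can be computed directly from the averages in the summary table (a minimal sketch using only those table values):
avg_words <- c(twitter = 13, news = 34, blogs = 42)         # average words per line, from the table above
weights   <- (sum(avg_words) - avg_words) / sum(avg_words)  # 76/89, 55/89, 47/89
round(25000 * weights)                                      # about 21348, 15449 and 13202 lines
sum(round(25000 * weights))                                 # close to 50,000 lines in total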
set.seed(1912)
# generate binomial line indices for each file, then sample lines in the
# weighted proportions described above (about 50,000 lines in total)
blog_idx  <- rbinom(length(blogdata), 100000, 0.5)
news_idx  <- rbinom(length(newsdata), 100000, 0.5)
tweet_idx <- rbinom(length(tweetdata), 100000, 0.5)
text_directory <- c(sample(blogdata[blog_idx], round(25000*47/89), replace = FALSE),
                    sample(newsdata[news_idx], round(25000*55/89), replace = FALSE),
                    sample(tweetdata[tweet_idx], round(25000*76/89), replace = FALSE)) # sample of text data
#blog sentence from the text sample
text_directory[1]
## [1] "The reason Osama bin Laden “sleeps with the fishes” is that there are scores of intelligence analysts whose sole job is to connect the dots from a mountain of information that flows into the CIA, the NSA, the Pentagon, the State Department, the FBI, and other government agencies charged with keeping us alive."
#news sentence from the text sample
text_directory[13203]
## [1] "Earnings season starts with aluminum producer Alcoa Inc. on Monday. The company's stock rose 1.9 percent ahead of its report. Other companies scheduled to release results next week include banking giants JPMorgan Chase & Co. and Bank of America Corp. General Electric Co. and chipmaker Intel Corp. are also scheduled to report earnings next week."
#tweet sentence from the text sample
text_directory[49999]
## [1] "The Girl with the Bacon Tattoo."
cat(text_directory, file = (con <- file("textsample.txt", "w+", encoding = "UTF-8")), sep = " ")
textsample <- readLines(con<-file("textsample.txt",encoding = "UTF-8"));close(con)
rm(blog_idx,news_idx,tweet_idx)
rm(blogdata,newsdata,tweetdata)
rm(text_directory)
set.seed(1912)
library(NLP)
library(tm) # tm package in R for text data mining
corp <- VCorpus(VectorSource(textsample), readerControl = list(reader = readPlain, language = "en"))
corp <- tm_map(corp, content_transformer(tolower))
# helper transformer that replaces a matched pattern with a space
reg_exp <- content_transformer(function(x, pattern) gsub(pattern, replacement = " ", x))
corp <- tm_map(corp, reg_exp, "[[:alnum:]]+\\@[[:alpha:]]+\\.com") # remove simple e-mail addresses
corp <- tm_map(corp, reg_exp, "[^a-z\\s]+")                        # keep only lower-case letters and whitespace
corp <- tm_map(corp, reg_exp, "\\(.*?\\)")                         # remove text in parentheses
corp <- tm_map(corp, reg_exp, "[[:punct:]]+")                      # remove punctuation
t1 = Sys.time()
library(stopwords,warn.conflicts = FALSE)
set.seed(201911)
corp<- tm_map(corp,removeWords,stopwords(language = "en",source = "smart"))
## profanity list of words downloaded from
con<- file("/Users/fathims/Desktop/R Practice/bad-words.txt","r")
profanityWords <- readLines(con)
close(con = con)
corp<- tm_map(corp,removeWords,profanityWords)
rm(profanityWords)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 34.03223 secs
t1 = Sys.time()
options(mc.cores=1)
set.seed(201912)
library(RWeka)
corp<-tm_map(corp,removeNumbers)
corp<-tm_map(corp,stripWhitespace)
corp<-tm_map(corp,PlainTextDocument)
tdm <- TermDocumentMatrix(corp)
#Tokenizing words of the Term Document Matrix
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
onegram_tdm <-removeSparseTerms(tdm, 0.9999)# Tokenizing not required for one gram
bigram_tdm <- removeSparseTerms(TermDocumentMatrix(corp, control = list(tokenize = bigram)), 0.9999)
trigram_tdm <- removeSparseTerms(TermDocumentMatrix(corp, control = list(tokenize = trigram)),0.9999)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 557.6852 secs
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
getFreq <- function(x){
    v <- sort(rowSums(as.matrix(x)), decreasing = TRUE)
    d <- data.frame(word = names(v), freq = v)
    return(d)
}
# getting the frequencies
freq1 <- getFreq(onegram_tdm)
freq2 <- getFreq(bigram_tdm)
freq3 <- getFreq(trigram_tdm)
makePlot <- function(data, x_label) {
    ggplot(data[1:40, ], aes(reorder(word, -freq), freq)) +
        geom_bar(stat = "identity", fill = I("gray")) +
        labs(x = x_label, y = "Frequency") +
        theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1))
}
# Histogram of the 40 most common 1-grams in the data sample
makePlot(freq1, "1-gram words")
# Histogram of the 40 most common 2-grams (bigrams) in the data sample
makePlot(freq2, "2-gram words")
# Histogram of the 40 most common 3-grams (trigrams) in the data sample
makePlot(freq3, "3-gram words")
t1 = Sys.time()
library(RColorBrewer)
library(wordcloud)
d  <- getFreq(onegram_tdm)  # 1-gram frequencies (same computation as freq1 above)
d2 <- getFreq(bigram_tdm)   # 2-gram frequencies (same computation as freq2 above)
# Most frequent 1-grams
findFreqTerms(onegram_tdm,lowfreq=500,highfreq=Inf)[1:10]
## [1] "ago" "back" "bad" "big" "book" "bring"
## [7] "business" "children" "city" "county"
#Word Cloud of more frequent 1-gram words
wordcloud(d$word, d$freq,max.words=80, min.freq=500,scale=c(1.5,.85),random.order=FALSE,rot.per=.25,use.r.layout=TRUE)
# Most frequent 2-grams
findFreqTerms(bigram_tdm,lowfreq=71,highfreq=Inf)[1:10]
## [1] "back people" "belle isle" "bono mack"
## [4] "book ve" "box set" "breaking ground"
## [7] "cell phone" "cell phones" "cessation prevention"
## [10] "choices make"
#Word Cloud of more frequent 2-gram words
wordcloud(d2$word, d2$freq,max.words=20, min.freq=71,scale=c(1.5,.85),random.order=FALSE,rot.per=.25,use.r.layout=TRUE)
# Most frequent 3-grams
findFreqTerms(trigram_tdm,lowfreq=61,highfreq=Inf)[1:10]
## [1] "add arsenal photo" "aren send money"
## [3] "arsenal photo gear" "ate chocolate boys"
## [5] "awake morning work" "barack rep usa"
## [7] "bought skinnygirl pi" "boys halloween stash"
## [9] "central west end" "chocolate boys halloween"
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 2.363206 secs
The plan for the prediction algorithm is to use the n-gram frequencies explored above: the algorithm predicts the next word from the probability of a word occurring given the word(s) that have already been typed. The trigram model is consulted first to predict the next word; if there are no matching trigrams, the algorithm backs off to the bigram model, and lastly to the unigram model. The model will then be deployed as a Shiny app in which the user types a phrase and the app suggests the most likely next word after a short time delay.
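As a rough illustration of this back-off idea, a minimal sketch reusing the frequency tables freq1, freq2 and freq3 built above could look as follows; the helper name predict_next is only illustrative, and a full implementation would need to escape regular-expression characters in the input, handle ties and use smoothed, trimmed lookup tables for the Shiny app.
# sketch of the trigram -> bigram -> unigram back-off lookup;
# assumes freq1, freq2, freq3 are data frames with columns word and freq, sorted by freq
predict_next <- function(phrase, freq1, freq2, freq3) {
    words <- unlist(strsplit(tolower(phrase), "\\s+"))
    n <- length(words)
    if (n == 0) return(as.character(freq1$word[1]))  # nothing typed: most frequent unigram
    # 1. try trigrams whose first two words match the last two words typed
    if (n >= 2) {
        hits <- freq3[grepl(paste0("^", words[n - 1], " ", words[n], " "), freq3$word), ]
        if (nrow(hits) > 0) return(sub(".* ", "", hits$word[1]))
    }
    # 2. back off to bigrams whose first word matches the last word typed
    hits <- freq2[grepl(paste0("^", words[n], " "), freq2$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", hits$word[1]))
    # 3. back off to the most frequent unigram overall
    as.character(freq1$word[1])
}
# example call; "cell" appears among the frequent bigrams listed above, so a completion such as "phone" is expected
predict_next("cell", freq1, freq2, freq3)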