library(tm)
library(plyr)
library(dplyr)
library(wordcloud)
library(ggplot2)
library(plotly)
library(RWeka)
library(SnowballC)
Data source: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
In this first phase of the project, we read the three files and extract some basic information: file size, number of lines, total number of words and the length of the longest line.
us_blogs_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
us_news_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt"
us_twitter_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
con_blog <- file(us_blogs_loc) ; con_news <- file(us_news_loc) ; con_twit <- file(us_twitter_loc)
# File size in MB
Size_MegaBytes <- c(file.info(us_blogs_loc)$size/1048576,file.info(us_news_loc)$size/1048576,file.info(us_twitter_loc)$size/1048576)
# Read the files
blog <- readLines(con_blog, encoding = "UTF-8")
news <- readLines(con_news, encoding = "UTF-8")
twit <- readLines(con_twit, encoding = "UTF-8", skipNul = TRUE) # the Twitter file contains embedded nulls
# Number of lines
Number_of_Lines <- c(length(blog), length(news),length(twit))
# Maximum number of characters in a line (nchar counts characters, not words)
Max_Characters_in_Line <- c(max(nchar(blog)), max(nchar(news)), max(nchar(twit)))
# Total word count
Total_Number_of_Words <- c(ngram::wordcount(blog),ngram::wordcount(news),ngram::wordcount(twit))
Average_Words_per_Line <- Total_Number_of_Words/Number_of_Lines
data_table_combined <- data.frame(Source=c("Blogs","News","Twitter"),Size_MegaBytes,Number_of_Lines,Max_Characters_in_Line,Total_Number_of_Words,Average_Words_per_Line)
knitr::kable(data_table_combined)
Source | Size_MegaBytes | Number_of_Lines | Max_Characters_in_Line | Total_Number_of_Words | Average_Words_per_Line |
---|---|---|---|---|---|
Blogs | 200.4242 | 899288 | 40833 | 37334131 | 41.51521 |
News | 196.2775 | 77259 | 5760 | 2643969 | 34.22215 |
Twitter | 159.3641 | 2360148 | 140 | 30373543 | 12.86934 |
set.seed(12345)
sample_size <- 20000
sblog <- sample(blog,sample_size)
snews <- sample(news,sample_size)
stwit <- sample(twit,sample_size)
iblog <- iconv(sblog,"UTF-8","ASCII","byte")
inews <- iconv(snews,"UTF-8","ASCII","byte")
itwit <- iconv(stwit,"UTF-8","ASCII","byte")
US_all_sample_data <- c(iblog,inews,itwit)
# Writing the combined sample to disk helps during development:
# each session can start by reading the saved sample file instead of
# re-reading the full data files.
#file_loc <- "D:/capstone/US_all_sample_data.txt"
#writeLines(US_all_sample_data,file_loc)
rm(blog,news,twit)
close(con_twit)
close(con_blog)
close(con_news)
US_corpus_20k <- VCorpus(VectorSource(US_all_sample_data))
According to the “tm” package vignette, some standard transformations are in order:
US_corpus_20k <- tm_map(US_corpus_20k,removeNumbers)
US_corpus_20k <- tm_map(US_corpus_20k,content_transformer(tolower))
US_corpus_20k <- tm_map(US_corpus_20k,removeWords,stopwords("english"))
US_corpus_20k <- tm_map(US_corpus_20k,removePunctuation)
US_corpus_20k <- tm_map(US_corpus_20k,stripWhitespace)
US_corpus_20k <- tm_map(US_corpus_20k,PlainTextDocument)
wordcloud(US_corpus_20k, scale=c(4,0.5), min.freq=10, max.words=150, random.order=TRUE,
rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)
Next comes the tokenization of the corpus. Two alternatives were identified: a TDM (Term-Document Matrix) or a data frame; the data-frame route is used below.
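For reference, a minimal sketch of the TDM alternative (not run here) is given below; it assumes the cleaned US_corpus_20k from above, and the bigram_tokenizer name and the frequency cut-off of 50 are only illustrative.
# Sketch of the TDM alternative: an RWeka n-gram tokenizer is supplied
# through the control list of TermDocumentMatrix.
bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bigrams <- TermDocumentMatrix(US_corpus_20k, control = list(tokenize = bigram_tokenizer))
# Bigrams appearing at least 50 times in the sampled corpus
findFreqTerms(tdm_bigrams, lowfreq = 50)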
df_US_corpus_20k <- data.frame(text = unlist(sapply(US_corpus_20k, '[', 'content')), stringsAsFactors = F)
# NGramTokenizer expects a character vector, so the text column is passed explicitly
df_uniGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 1, max = 1))))
df_biGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 2, max = 2))))
df_triGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 3, max = 3))))
# for dev purposes
#write.csv(df_uniGramToken,'unigram.csv')
#write.csv(df_biGramToken,'bigram.csv')
#write.csv(df_triGramToken,'trigram.csv')
#useful during dev
#df_uniGramToken <- read.csv('unigram.csv')
# Order by decreasing frequency and keep the top 26, then drop the artifact
# token "e" (likely left over from the byte substitution in iconv), leaving 25
uniG <- head(df_uniGramToken[order(-df_uniGramToken$Freq),],26)
names(uniG) <- c("word","frequency")
data <- uniG %>% filter(word != 'e')
ggplot2::ggplot(data, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  ggtitle("UniGram Frequency") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
#useful during dev
#df_biGramToken <- read.csv('bigram.csv')
#order by decreasing frequency
BiGram <- head(df_biGramToken[order(-df_biGramToken$Freq),],25)
names(BiGram) <- c("word","frequency")
ggplot2::ggplot(BiGram, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  ggtitle("BiGram Frequency") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
#useful during dev
#df_triGramToken <- read.csv('trigram.csv')
#order by decreasing frequency
TriGram <- head(df_triGramToken[order(-df_triGramToken$Freq),],25)
names(TriGram) <- c("word","frequency")
ggplot2::ggplot(TriGram, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  ggtitle("TriGram Frequency") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
In this work, we have read three different data sources (blogs, news and Twitter feeds). From each we took a sample, merged the samples into a single block of text (the corpus), and applied a set of transformations to it (removed numbers, stop words and punctuation, and converted to lowercase).
For the merged dataset (the corpus) we plotted a “word cloud”, a colour plot of the 150 most common words in which more frequent words are drawn larger. We then “tokenized” the corpus to extract the frequencies of the most common unigrams, bigrams and trigrams, i.e. the most common single words and combinations of two and three words, respectively.
There were some basic assumptions in place for the exploratory data analysis. Those were:

- a random sample of 20,000 lines per source is representative of that source;
- non-ASCII characters can be byte-substituted (via iconv) without materially affecting word frequencies;
- numbers, punctuation and English stop words carry little information at this exploratory stage and can be removed.
It remains to create the TDM (Term-Document Matrix, https://en.wikipedia.org/wiki/Document-term_matrix ), from which sparse terms can be eliminated. The sparsity of the terms can also be compared across samples of different sizes, to find a sample size that is neither too small nor so large that a bigger sample would barely reduce sparsity further.
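As a rough sketch of that step (again assuming the US_corpus_20k object from above; the 0.999 sparsity threshold is an arbitrary illustration, not a tuned value), a TDM can be built and trimmed as follows; repeating this for corpora built from samples of different sizes gives the comparison described above.
# Sketch only: build a TDM from the cleaned corpus and drop very sparse terms.
# sparse = 0.999 keeps terms present in at least ~0.1% of the documents.
tdm <- TermDocumentMatrix(US_corpus_20k)
tdm_trimmed <- removeSparseTerms(tdm, sparse = 0.999)
# Compare dimensions before and after trimming; rerun with different
# sample sizes to see how the retained vocabulary grows with sample size.
dim(tdm)
dim(tdm_trimmed)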
With the right sample in place, we will then frame the prediction problem: build the n-grams, remove the last word of each, and attempt to predict it with an algorithm.
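As a first, hedged sketch of that prediction step (built on the df_triGramToken table from above; the predict_next helper and its column names are illustrative, not the final algorithm), each trigram can be split into a two-word prefix and its last word, and the most frequent completion returned:
# Sketch only: split each observed trigram into a two-word prefix and its
# last word, then predict the most frequent last word for a given prefix.
tri <- df_triGramToken
names(tri) <- c("ngram", "freq")
tri$ngram  <- as.character(tri$ngram)
tri$prefix <- sub("\\s+\\S+$", "", tri$ngram)          # everything but the last word
tri$last   <- sub("^.*\\s+(\\S+)$", "\\1", tri$ngram)  # only the last word

predict_next <- function(two_words) {
  candidates <- tri[tri$prefix == two_words, ]
  if (nrow(candidates) == 0) return(NA_character_)
  candidates$last[which.max(candidates$freq)]
}
predict_next("one of")  # most frequent third word following "one of" in the sample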
N.Perdikis