Burhan Khan
February 19, 2018
Summary
This report explains basic steps for text mining analysis consist of:
- Load training datasets
- Basic analysis for datasets such as word count, line count, size of the file
- Data manipulation including limit our sample size, aggregate our dataset, cleaning dataset, corpus creation and build our n-gram model
- Data Exploration
- Insights
- Way forward
Loading libraries
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)
library(wordcloud)
Loading our data set for Twitter, blog and news.
ds_us_twitter <- file("D:\\Data\\final\\en_US\\en_US.twitter.txt")
ds_us_blog <- file("D:\\Data\\final\\en_US\\en_US.blogs.txt")
ds_us_news <- file("D:\\Data\\final\\en_US\\en_US.news.txt")
us_twitter <- readLines(ds_us_twitter)
us_blog <- readLines(ds_us_blog)
us_news <- readLines(ds_us_news)
Line Counts
Twitter data line count
twitter_linecount <- length(us_twitter)
twitter_linecount
[1] 2360148
Blog data line count
blog_linecount <- length(us_blog)
blog_linecount
[1] 899288
News data line count
news_linecount <- length(us_news)
news_linecount
[1] 1010242
Words count
twitter_wordcount <- sum(stri_count_words(us_twitter))
blog_wordcount <- sum(stri_count_words(us_blog))
news_wordcount <- sum(stri_count_words(us_news))
cat('Twitter data count: ', twitter_wordcount)
Twitter data count: 30218125
cat('\nBlog data word count: ',blog_wordcount)
Blog data word count: 38154238
cat('\nNews data word count: ',news_wordcount)
News data word count: 35010786
Calculating file size in MB
twitter_size <- file.info("D:\\Data\\final\\en_US\\en_US.twitter.txt")$size/1024000
blog_size <- file.info("D:\\Data\\final\\en_US\\en_US.blogs.txt")$size/1024000
news_size <- file.info("D:\\Data\\final\\en_US\\en_US.news.txt")$size/1024000
Summarizing
basic_summary_table <- data.frame(Dataset.Name=c("Twitter","Blogs","News"),Dataset.Size.MB=format(c(twitter_size,blog_size,news_size),digits = 5),Number.of.Lines=format(c(twitter_linecount,blog_linecount,news_linecount),big.mark = ","),Word.Count=format(c(twitter_wordcount,blog_wordcount,news_wordcount),big.mark = ","))
knitr::kable(basic_summary_table)
| Twitter |
163.19 |
2,360,148 |
30,218,125 |
| Blogs |
205.23 |
899,288 |
38,154,238 |
| News |
200.99 |
1,010,242 |
35,010,786 |
Segregating data into test and train, taking 70% data for train and 20% for test
Also, we are taking only Blog data for illustration purpose.
train_blog <- sample(us_blog, round(length(us_blog)*.7))
Cleaning the data and setting the default encoding
train_blog <- iconv(train_blog,"UTF-8","ASCII","byte")
Creating our corpus
master_corpus <- Corpus(VectorSource(sample(train_blog)))
We will be using only 5000 lines of data from out train for illustration purpose
main_corpus<-sample(master_corpus,5000)
Standardizing our Corpus to lower case, removing puntuatuations, numbers and white spaces.
main_corpus <- tm_map(main_corpus, tolower)
main_corpus <- tm_map(main_corpus, removePunctuation)
main_corpus <- tm_map(main_corpus, removeWords, stopwords("en"))
main_corpus <- tm_map(main_corpus, removeNumbers)
main_corpus <- tm_map(main_corpus, stripWhitespace)
Create tokenizer function for uni-gram, bi-gram and tri-gram
unigram_token <- NGramTokenizer(main_corpus, Weka_control(min=1, max=1))
bigram_token <- NGramTokenizer(main_corpus, Weka_control(min=2, max=2))
trigram_token <- NGramTokenizer(main_corpus, Weka_control(min=3, max=3))
# Construct data frame for each n-gram
unigram_df <- data.frame(table(unigram_token))
bigram_df <- data.frame(table(bigram_token))
trigram_df <- data.frame(table(trigram_token))
Generating Word Cloud for out train data
wordcloud(main_corpus, max.words=100, random.order=FALSE,colors=brewer.pal(8, "Dark2"))

Get top 20 most frequent words
hplot <- unigram_df[order(-unigram_df$Freq),][1:20,]
ggplot(hplot, aes(reorder(unigram_token, -Freq),Freq)) + geom_bar(stat = "Identity", fill = "Blue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x="1-gram", y="Frequency", title="Top Most 20 Unigram Word")

Get top 20 most frequent words
hplot <- bigram_df[order(-bigram_df$Freq),][1:20,]
ggplot(hplot, aes(reorder(bigram_token, -Freq),Freq)) + geom_bar(stat = "Identity", fill = "Blue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x="2-gram", y="Frequency", title="Top Most 20 Bigram Word")

Get top 20 most frequent words
hplot <- trigram_df[order(-trigram_df$Freq),][1:20,]
ggplot(hplot, aes(reorder(trigram_token, -Freq),Freq)) + geom_bar(stat = "Identity", fill = "Blue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x="3-gram", y="Frequency", title="Top Most 20 Trigram Word")

Findings
Most of unigram words used commonly in everyday communication There are few places name appear in bigram model and all of it resides in US as our sample dataset comes from US tweet, blogs and news
Way Forward
As we already understand some basic nature of our dataset, next we have to identify how to improve our analysis performance and minimize the footprint when we run our predective text model online with shiny server. Final goal is to take 3 words from the user and predict the next word.
LS0tDQp0aXRsZTogIk1pbGVzdG9uZSByZXBvcnQiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpCdXJoYW4gS2hhbg0KDQpGZWJydWFyeSAxOSwgMjAxOA0KDQoNCjxoMj5TdW1tYXJ5IDwvaDI+DQoNCjxoMz5UaGlzIHJlcG9ydCBleHBsYWlucyBiYXNpYyBzdGVwcyBmb3IgdGV4dCBtaW5pbmcgYW5hbHlzaXMgY29uc2lzdCBvZjo8L2gzPg0KDQoxLiBMb2FkIHRyYWluaW5nIGRhdGFzZXRzDQoyLiBCYXNpYyBhbmFseXNpcyBmb3IgZGF0YXNldHMgc3VjaCBhcyB3b3JkIGNvdW50LCBsaW5lIGNvdW50LCBzaXplIG9mIHRoZSBmaWxlDQozLiBEYXRhIG1hbmlwdWxhdGlvbiBpbmNsdWRpbmcgbGltaXQgb3VyIHNhbXBsZSBzaXplLCBhZ2dyZWdhdGUgb3VyIGRhdGFzZXQsIGNsZWFuaW5nIGRhdGFzZXQsIGNvcnB1cyBjcmVhdGlvbiBhbmQgYnVpbGQgb3VyIG4tZ3JhbSBtb2RlbA0KNC4gRGF0YSBFeHBsb3JhdGlvbiANCjUuIEluc2lnaHRzDQo2LiBXYXkgZm9yd2FyZA0KDQoNCkxvYWRpbmcgbGlicmFyaWVzIA0KYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoc3RyaW5naSkNCmxpYnJhcnkodG0pDQpsaWJyYXJ5KFJXZWthKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQpgYGANCg0KDQoNCkxvYWRpbmcgb3VyIGRhdGEgc2V0IGZvciBUd2l0dGVyLCBibG9nIGFuZCBuZXdzLg0KDQpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KZHNfdXNfdHdpdHRlciA8LSBmaWxlKCJEOlxcRGF0YVxcZmluYWxcXGVuX1VTXFxlbl9VUy50d2l0dGVyLnR4dCIpDQpkc191c19ibG9nIDwtIGZpbGUoIkQ6XFxEYXRhXFxmaW5hbFxcZW5fVVNcXGVuX1VTLmJsb2dzLnR4dCIpDQpkc191c19uZXdzIDwtIGZpbGUoIkQ6XFxEYXRhXFxmaW5hbFxcZW5fVVNcXGVuX1VTLm5ld3MudHh0IikNCiANCnVzX3R3aXR0ZXIgPC0gcmVhZExpbmVzKGRzX3VzX3R3aXR0ZXIpDQp1c19ibG9nIDwtIHJlYWRMaW5lcyhkc191c19ibG9nKQ0KdXNfbmV3cyA8LSByZWFkTGluZXMoZHNfdXNfbmV3cykNCmBgYA0KDQo8aDM+TGluZSBDb3VudHM8L2gzPg0KDQpUd2l0dGVyIGRhdGEgbGluZSBjb3VudA0KYGBge3J9DQp0d2l0dGVyX2xpbmVjb3VudCA8LSBsZW5ndGgodXNfdHdpdHRlcikNCnR3aXR0ZXJfbGluZWNvdW50DQpgYGANCg0KQmxvZyBkYXRhIGxpbmUgY291bnQNCmBgYHtyfQ0KYmxvZ19saW5lY291bnQgPC0gbGVuZ3RoKHVzX2Jsb2cpDQpibG9nX2xpbmVjb3VudA0KYGBgDQoNCk5ld3MgZGF0YSBsaW5lIGNvdW50DQpgYGB7cn0NCm5ld3NfbGluZWNvdW50IDwtIGxlbmd0aCh1c19uZXdzKQ0KbmV3c19saW5lY291bnQNCmBgYA0KDQo8aDM+V29yZHMgY291bnQ8L2gzPg0KYGBge3J9DQp0d2l0dGVyX3dvcmRjb3VudCA8LSBzdW0oc3RyaV9jb3VudF93b3Jkcyh1c190d2l0dGVyKSkNCmJsb2dfd29yZGNvdW50IDwtIHN1bShzdHJpX2NvdW50X3dvcmRzKHVzX2Jsb2cpKQ0KbmV3c193b3JkY291bnQgPC0gc3VtKHN0cmlfY291bnRfd29yZHModXNfbmV3cykpDQoNCmNhdCgnVHdpdHRlciBkYXRhIGNvdW50OiAnLCB0d2l0dGVyX3dvcmRjb3VudCkNCg0KY2F0KCdcbkJsb2cgZGF0YSB3b3JkIGNvdW50OiAnLGJsb2dfd29yZGNvdW50KQ0KDQpjYXQoJ1xuTmV3cyBkYXRhIHdvcmQgY291bnQ6ICcsbmV3c193b3JkY291bnQpDQpgYGANCg0KPGgzPkNhbGN1bGF0aW5nIGZpbGUgc2l6ZSBpbiBNQjwvaDM+DQpgYGB7cn0NCnR3aXR0ZXJfc2l6ZSA8LSBmaWxlLmluZm8oIkQ6XFxEYXRhXFxmaW5hbFxcZW5fVVNcXGVuX1VTLnR3aXR0ZXIudHh0Iikkc2l6ZS8xMDI0MDAwDQpibG9nX3NpemUgPC0gZmlsZS5pbmZvKCJEOlxcRGF0YVxcZmluYWxcXGVuX1VTXFxlbl9VUy5ibG9ncy50eHQiKSRzaXplLzEwMjQwMDANCm5ld3Nfc2l6ZSA8LSBmaWxlLmluZm8oIkQ6XFxEYXRhXFxmaW5hbFxcZW5fVVNcXGVuX1VTLm5ld3MudHh0Iikkc2l6ZS8xMDI0MDAwDQpgYGANCg0KPGgzPjxiPlN1bW1hcml6aW5nPC9iPjwvaDM+DQpgYGB7cn0NCmJhc2ljX3N1bW1hcnlfdGFibGUgPC0gZGF0YS5mcmFtZShEYXRhc2V0Lk5hbWU9YygiVHdpdHRlciIsIkJsb2dzIiwiTmV3cyIpLERhdGFzZXQuU2l6ZS5NQj1mb3JtYXQoYyh0d2l0dGVyX3NpemUsYmxvZ19zaXplLG5ld3Nfc2l6ZSksZGlnaXRzID0gNSksTnVtYmVyLm9mLkxpbmVzPWZvcm1hdChjKHR3aXR0ZXJfbGluZWNvdW50LGJsb2dfbGluZWNvdW50LG5ld3NfbGluZWNvdW50KSxiaWcubWFyayA9ICIsIiksV29yZC5Db3VudD1mb3JtYXQoYyh0d2l0dGVyX3dvcmRjb3VudCxibG9nX3dvcmRjb3VudCxuZXdzX3dvcmRjb3VudCksYmlnLm1hcmsgPSAiLCIpKQ0KDQprbml0cjo6a2FibGUoYmFzaWNfc3VtbWFyeV90YWJsZSkNCmBgYA0KDQoNCjxoMz5TZWdyZWdhdGluZyBkYXRhIGludG8gdGVzdCBhbmQgdHJhaW4sIHRha2luZyA3MCUgZGF0YSBmb3IgdHJhaW4gYW5kIDIwJSBmb3IgdGVzdDwvaDM+DQoNCjxiPkFsc28sIHdlIGFyZSB0YWtpbmcgb25seSBCbG9nIGRhdGEgZm9yIGlsbHVzdHJhdGlvbiBwdXJwb3NlLjwvYj4NCmBgYHtyfQ0KdHJhaW5fYmxvZyA8LSBzYW1wbGUodXNfYmxvZywgcm91bmQobGVuZ3RoKHVzX2Jsb2cpKi43KSkNCmBgYA0KDQo8aDM+Q2xlYW5pbmcgdGhlIGRhdGEgYW5kIHNldHRpbmcgdGhlIGRlZmF1bHQgZW5jb2Rpbmc8L2gzPg0KYGBge3J9DQp0cmFpbl9ibG9nIDwtIGljb252KHRyYWluX2Jsb2csIlVURi04IiwiQVNDSUkiLCJieXRlIikNCmBgYA0KDQo8aDM+Q3JlYXRpbmcgb3VyIGNvcnB1czwvaDM+DQpgYGB7cn0NCm1hc3Rlcl9jb3JwdXMgPC0gQ29ycHVzKFZlY3RvclNvdXJjZShzYW1wbGUodHJhaW5fYmxvZykpKQ0KYGBgDQoNCg0KDQo8Yj5XZSB3aWxsIGJlIHVzaW5nIG9ubHkgNTAwMCBsaW5lcyBvZiBkYXRhIGZyb20gb3V0IHRyYWluIGZvciBpbGx1c3RyYXRpb24gcHVycG9zZTwvYj4NCg0KYGBge3J9DQptYWluX2NvcnB1czwtc2FtcGxlKG1hc3Rlcl9jb3JwdXMsNTAwMCkNCmBgYA0KDQo8aDM+U3RhbmRhcmRpemluZyBvdXIgQ29ycHVzIHRvIGxvd2VyIGNhc2UsIHJlbW92aW5nIHB1bnR1YXR1YXRpb25zLCBudW1iZXJzIGFuZCB3aGl0ZSBzcGFjZXMuPC9oMz4NCmBgYHtyfQ0KbWFpbl9jb3JwdXMgPC0gdG1fbWFwKG1haW5fY29ycHVzLCB0b2xvd2VyKQ0KDQptYWluX2NvcnB1cyA8LSB0bV9tYXAobWFpbl9jb3JwdXMsIHJlbW92ZVB1bmN0dWF0aW9uKQ0KDQptYWluX2NvcnB1cyA8LSB0bV9tYXAobWFpbl9jb3JwdXMsIHJlbW92ZVdvcmRzLCBzdG9wd29yZHMoImVuIikpDQoNCm1haW5fY29ycHVzIDwtIHRtX21hcChtYWluX2NvcnB1cywgcmVtb3ZlTnVtYmVycykNCg0KbWFpbl9jb3JwdXMgPC0gdG1fbWFwKG1haW5fY29ycHVzLCBzdHJpcFdoaXRlc3BhY2UpDQpgYGANCg0KDQo8aDM+Q3JlYXRlIHRva2VuaXplciBmdW5jdGlvbiBmb3IgdW5pLWdyYW0sIGJpLWdyYW0gYW5kIHRyaS1ncmFtPC9oMz4NCmBgYHtyfQ0KdW5pZ3JhbV90b2tlbiA8LSBOR3JhbVRva2VuaXplcihtYWluX2NvcnB1cywgV2VrYV9jb250cm9sKG1pbj0xLCBtYXg9MSkpDQpiaWdyYW1fdG9rZW4gPC0gTkdyYW1Ub2tlbml6ZXIobWFpbl9jb3JwdXMsIFdla2FfY29udHJvbChtaW49MiwgbWF4PTIpKQ0KdHJpZ3JhbV90b2tlbiA8LSBOR3JhbVRva2VuaXplcihtYWluX2NvcnB1cywgV2VrYV9jb250cm9sKG1pbj0zLCBtYXg9MykpDQojIENvbnN0cnVjdCBkYXRhIGZyYW1lIGZvciBlYWNoIG4tZ3JhbQ0KdW5pZ3JhbV9kZiA8LSBkYXRhLmZyYW1lKHRhYmxlKHVuaWdyYW1fdG9rZW4pKQ0KYmlncmFtX2RmIDwtIGRhdGEuZnJhbWUodGFibGUoYmlncmFtX3Rva2VuKSkNCnRyaWdyYW1fZGYgPC0gZGF0YS5mcmFtZSh0YWJsZSh0cmlncmFtX3Rva2VuKSkNCmBgYA0KDQo8aDM+R2VuZXJhdGluZyBXb3JkIENsb3VkIGZvciBvdXQgdHJhaW4gZGF0YTwvaDM+DQpgYGB7cn0NCndvcmRjbG91ZChtYWluX2NvcnB1cywgbWF4LndvcmRzPTEwMCwgcmFuZG9tLm9yZGVyPUZBTFNFLGNvbG9ycz1icmV3ZXIucGFsKDgsICJEYXJrMiIpKQ0KYGBgDQoNCjxoMz5HZXQgdG9wIDIwIG1vc3QgZnJlcXVlbnQgd29yZHM8L2gzPg0KYGBge3J9DQpocGxvdCA8LSB1bmlncmFtX2RmW29yZGVyKC11bmlncmFtX2RmJEZyZXEpLF1bMToyMCxdDQoNCmdncGxvdChocGxvdCwgYWVzKHJlb3JkZXIodW5pZ3JhbV90b2tlbiwgLUZyZXEpLEZyZXEpKSArIGdlb21fYmFyKHN0YXQgPSAiSWRlbnRpdHkiLCBmaWxsID0gIkJsdWUiKSArIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIGhqdXN0ID0gMSkpICsgbGFicyh4PSIxLWdyYW0iLCB5PSJGcmVxdWVuY3kiLCB0aXRsZT0iVG9wIE1vc3QgMjAgVW5pZ3JhbSBXb3JkIikNCmBgYA0KDQo8aDM+R2V0IHRvcCAyMCBtb3N0IGZyZXF1ZW50IHdvcmRzPC9oMz4NCmBgYHtyfQ0KaHBsb3QgPC0gYmlncmFtX2RmW29yZGVyKC1iaWdyYW1fZGYkRnJlcSksXVsxOjIwLF0NCg0KZ2dwbG90KGhwbG90LCBhZXMocmVvcmRlcihiaWdyYW1fdG9rZW4sIC1GcmVxKSxGcmVxKSkgKyBnZW9tX2JhcihzdGF0ID0gIklkZW50aXR5IiwgZmlsbCA9ICJCbHVlIikgKyB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDQ1LCBoanVzdCA9IDEpKSArIGxhYnMoeD0iMi1ncmFtIiwgeT0iRnJlcXVlbmN5IiwgdGl0bGU9IlRvcCBNb3N0IDIwIEJpZ3JhbSBXb3JkIikNCmBgYA0KDQo8aDM+R2V0IHRvcCAyMCBtb3N0IGZyZXF1ZW50IHdvcmRzPC9oMz4NCmBgYHtyfQ0KaHBsb3QgPC0gdHJpZ3JhbV9kZltvcmRlcigtdHJpZ3JhbV9kZiRGcmVxKSxdWzE6MjAsXQ0KDQpnZ3Bsb3QoaHBsb3QsIGFlcyhyZW9yZGVyKHRyaWdyYW1fdG9rZW4sIC1GcmVxKSxGcmVxKSkgKyBnZW9tX2JhcihzdGF0ID0gIklkZW50aXR5IiwgZmlsbCA9ICJCbHVlIikgKyB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDQ1LCBoanVzdCA9IDEpKSArIGxhYnMoeD0iMy1ncmFtIiwgeT0iRnJlcXVlbmN5IiwgdGl0bGU9IlRvcCBNb3N0IDIwIFRyaWdyYW0gV29yZCIpDQpgYGANCg0KDQo8aDI+RmluZGluZ3M8L2gyPg0KTW9zdCBvZiB1bmlncmFtIHdvcmRzIHVzZWQgY29tbW9ubHkgaW4gZXZlcnlkYXkgY29tbXVuaWNhdGlvbg0KVGhlcmUgYXJlIGZldyBwbGFjZXMgbmFtZSBhcHBlYXIgaW4gYmlncmFtIG1vZGVsIGFuZCBhbGwgb2YgaXQgcmVzaWRlcyBpbiBVUyBhcyBvdXIgc2FtcGxlIGRhdGFzZXQgY29tZXMgZnJvbSBVUyB0d2VldCwgYmxvZ3MgYW5kIG5ld3MNCg0KPGgyPldheSBGb3J3YXJkPC9oMj4NCkFzIHdlIGFscmVhZHkgdW5kZXJzdGFuZCBzb21lIGJhc2ljIG5hdHVyZSBvZiBvdXIgZGF0YXNldCwgbmV4dCB3ZSBoYXZlIHRvIGlkZW50aWZ5IGhvdyB0byBpbXByb3ZlIG91ciBhbmFseXNpcyBwZXJmb3JtYW5jZSBhbmQgbWluaW1pemUgdGhlIGZvb3RwcmludCB3aGVuIHdlIHJ1biBvdXIgcHJlZGVjdGl2ZSB0ZXh0IG1vZGVsIG9ubGluZSB3aXRoIHNoaW55IHNlcnZlci4NCkZpbmFsIGdvYWwgaXMgdG8gdGFrZSAzIHdvcmRzIGZyb20gdGhlIHVzZXIgYW5kIHByZWRpY3QgdGhlIG5leHQgd29yZC4NCg0K