The capstone project applies data science techniques to natural language processing. The key task is to build an R Shiny application that predicts the next word given a preceding word or sequence of words. SwiftKey is the corporate sponsor for this project.
# Loading required libraries
library(tm)
## Loading required package: NLP
library(ngram)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
En_Twit_text <- readLines("./data/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul=TRUE)
En_US_blogs_text <- readLines("./data/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul=TRUE)
En_US_NEWS_text <- readLines("./data/en_US/en_US.news.txt", encoding = "UTF-8", skipNul=TRUE)
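The three files are large, which motivates sampling only a small fraction of each for model building. The lines below are a quick sketch (not part of the original run) of line counts and rough whitespace-delimited word counts, using only the objects loaded above.
# Rough corpus-size summary for the loaded files (line counts and approximate word counts)
data.frame(source = c("twitter", "blogs", "news"),
           lines  = c(length(En_Twit_text), length(En_US_blogs_text), length(En_US_NEWS_text)),
           words  = c(sum(lengths(strsplit(En_Twit_text, "\\s+"))),
                      sum(lengths(strsplit(En_US_blogs_text, "\\s+"))),
                      sum(lengths(strsplit(En_US_NEWS_text, "\\s+")))))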
# Random sampling from the three text files to be used for model building
set.seed(1234)   # seed chosen arbitrarily so the sample is reproducible
twitter_sample <- sample(En_Twit_text, round(length(En_Twit_text) * 0.015), replace = FALSE)
news_sample    <- sample(En_US_NEWS_text, round(length(En_US_NEWS_text) * 0.015), replace = FALSE)
blogs_sample   <- sample(En_US_blogs_text, round(length(En_US_blogs_text) * 0.015), replace = FALSE)
# Creating a corpus for text mining and pre-processing
sample_files <- c(twitter_sample, news_sample, blogs_sample)
files <- Corpus(VectorSource(sample_files))
# Data frame of the sampled lines with source labels (label order now matches the data order)
df.nwords.all <- data.frame(nword = c(twitter_sample, news_sample, blogs_sample),
                            type = c(rep("twitter", length(twitter_sample)),
                                     rep("news", length(news_sample)),
                                     rep("blogs", length(blogs_sample))))
# Collapse a character vector into a single document and wrap it in a tm corpus
make_Corpus <- function(test_file) {
    gen_corp <- paste(test_file, collapse = " ")
    gen_corp <- VectorSource(gen_corp)
    Corpus(gen_corp)
}
# Clean a corpus: replace odd separators with spaces, then remove numbers, case,
# English stop words, punctuation and extra whitespace
clean_corp <- function(corp_data) {
    WordSeparators <- "[[:punct:]]|\u00ad|\u0091|\u0092|\u0093|\u0094|\u0095|\u0096|\u0097|\u0098|\u00a6"
    toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
    corp_data <- tm_map(corp_data, toSpace, WordSeparators)
    corp_data <- tm_map(corp_data, removeNumbers)
    corp_data <- tm_map(corp_data, content_transformer(tolower))
    corp_data <- tm_map(corp_data, removeWords, stopwords("english"))
    corp_data <- tm_map(corp_data, removePunctuation)
    corp_data <- tm_map(corp_data, stripWhitespace)
    corp_data <- tm_map(corp_data, PlainTextDocument)
    return(corp_data)
}
# Build a document-term matrix and return word frequencies in decreasing order
high_freq_words <- function(corp_data) {
    term_sparse <- DocumentTermMatrix(corp_data)
    term_matrix <- as.matrix(term_sparse)   # convert the sparse document-term matrix into a dense matrix
    freq_words <- colSums(term_matrix)
    freq_words <- as.data.frame(sort(freq_words, decreasing = TRUE))
    freq_words$word <- rownames(freq_words)
    colnames(freq_words) <- c("Frequency", "word")
    return(freq_words)
}
## en_US.news.txt High frequency words
US_news_corpus <- make_Corpus(news_sample)
US_news_corpus <- clean_corp(US_news_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, toSpace, WordSeparators):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, PlainTextDocument):
## transformation drops documents
US_news_most_used_word <- high_freq_words(US_news_corpus)
US_news_most_used_word1<- US_news_most_used_word[1:15,]
p<-ggplot(data=US_news_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : US News") +theme(legend.title=element_blank()) + coord_flip()
## en_US.blogs.txt High frequency words
US_blogs_corpus <- make_Corpus(blogs_sample)
US_blogs_corpus <- clean_corp(US_blogs_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, toSpace, WordSeparators):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, PlainTextDocument):
## transformation drops documents
US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
US_blogs_most_used_word1<- US_blogs_most_used_word[1:15,]
p<-ggplot(data=US_blogs_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : US blogs") +theme(legend.title=element_blank()) + coord_flip()
## en_US.twitter.txt High frequency words
twitter_corpus <- make_Corpus(twitter_sample)
twitter_corpus <- clean_corp(twitter_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, toSpace, WordSeparators):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, PlainTextDocument):
## transformation drops documents
twitter_most_used_word <- high_freq_words(twitter_corpus)
twitter_most_used_word1<- twitter_most_used_word[1:15,]
p<-ggplot(data=twitter_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : Twitter") +theme(legend.title=element_blank()) + coord_flip()
## US News Word Cloud
wordcloud(US_news_most_used_word$word[1:100], US_news_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## US Blogs Word Cloud
wordcloud(US_blogs_most_used_word$word[1:100], US_blogs_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## US Twitter Word Cloud
wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
For the analysis of the text documents we need to create bag-of-words matrices of unigrams, bigrams, and trigrams. These n-gram models improve the predictive power of the analysis.
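The same quanteda pipeline (tokenize, lowercase, drop stop words, build n-grams, build a dfm) is applied to each of the three sources below. As a minimal sketch, and purely illustrative (the helper name build_ngram_dfm is hypothetical and is not used in the runs below), the repeated steps could be wrapped in one function:
# Hypothetical helper wrapping the quanteda steps repeated below: tokens -> n-grams -> dfm
build_ngram_dfm <- function(text_sample, n = 1) {
    toks <- tokens(text_sample, what = "word", remove_numbers = TRUE,
                   remove_punct = TRUE, remove_separators = TRUE, remove_symbols = TRUE)
    toks <- tokens_tolower(toks)
    toks <- tokens_select(toks, stopwords("english"), selection = "remove")
    dfm(tokens_ngrams(toks, n = n))
}
# Example usage (equivalent to the bigram step for the news sample):
# topfeatures(build_ngram_dfm(news_sample, n = 2), 20)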
## en_US.news.txt n-gram frequencies
US_News_tokens<- tokens(news_sample,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_News_tokens <- tokens_tolower(US_News_tokens)
US_News_tokens <- tokens_select(US_News_tokens, stopwords(),selection ="remove")
US_News_unigram <- tokens_ngrams(US_News_tokens, n=1) ## unigram
US_News_unigram.dfm <- dfm(US_News_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_News_bigram <- tokens_ngrams(US_News_tokens, n=2) ## bigram
US_News_bigram.dfm <- dfm(US_News_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_News_trigram <- tokens_ngrams(US_News_tokens, n=3) ## trigram
US_News_trigram.dfm <- dfm(US_News_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(US_News_unigram.dfm, 20) # 20 top US News Unigram words
## said one new also two can year just first time
## 3793 1249 1035 925 859 831 831 810 801 783
## people last state years like get school make city now
## 736 728 726 690 689 614 565 560 559 526
topfeatures(US_News_trigram.dfm, 20) # 20 top US News Trigram words
## president_barack_obama two_years_ago
## 28 25
## gov_chris_christie new_york_city
## 24 20
## four_years_ago county_prosecutor's_office
## 15 14
## national_weather_service three_years_ago
## 11 10
## world_war_ii u.s_district_court
## 9 9
## last_three_years st_louis_county
## 9 9
## past_two_years cents_per_share
## 9 9
## new_york_times securities_exchange_commission
## 8 8
## superior_court_judge u.s_supreme_court
## 8 8
## chief_executive_officer u.s_district_judge
## 7 7
## en_US.blogs.txt n-gram frequencies
US_blogs_tokens<- tokens(blogs_sample,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords(),selection ="remove")
US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n=1) ## unigram
US_blogs_unigram.dfm <- dfm(US_blogs_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n=2) ## bigram
US_blogs_bigram.dfm <- dfm(US_blogs_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n=3) ## trigram
US_blogs_trigram.dfm <- dfm(US_blogs_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(US_blogs_unigram.dfm, 20) # 20 top US blogs Unigram words
## one can just like time get people know now back
## 1875 1482 1471 1416 1291 1055 957 939 879 837
## also us even make new really well much first see
## 830 811 797 780 764 759 751 751 747 729
topfeatures(US_blogs_bigram.dfm, 20) # 20 top US blogs Bigram words
## new_york can_see right_now last_year years_ago
## 80 78 75 75 73
## make_sure first_time little_bit one_thing feel_like
## 64 61 54 54 54
## even_though one_day last_night last_week many_people
## 52 52 52 51 49
## high_school long_time every_day united_states let_know
## 47 45 45 39 39
topfeatures(US_blogs_trigram.dfm, 20) # 20 top US blogs Trigram words
## couple_weeks_ago new_york_times new_york_city
## 11 10 9
## look_forward_seeing happy_new_year world_war_ii
## 8 7 7
## love_love_love incorporated_item_pp mummy_mummy_mummy
## 6 6 6
## spend_much_time just_little_bit two_weeks_ago
## 5 5 5
## spend_lot_time dream_come_true one_favorite_things
## 5 5 5
## four_years_ago ghost_towns_oklahoma three_years_old
## 5 5 4
## long_time_now last_two_years
## 4 4
## en_US.twitter.txt n-gram frequencies
twitter_tokens<- tokens(twitter_sample,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
twitter_tokens <- tokens_tolower(twitter_tokens)
twitter_tokens <- tokens_select(twitter_tokens, stopwords(),selection ="remove")
twitter_unigram <- tokens_ngrams(twitter_tokens, n=1) ## unigram
twitter_unigram.dfm <- dfm(twitter_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
twitter_bigram <- tokens_ngrams(twitter_tokens, n=2) ## bigram
twitter_bigram.dfm <- dfm(twitter_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
twitter_trigram <- tokens_ngrams(twitter_tokens, n=3) ## trigram
twitter_trigram.dfm <- dfm(twitter_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(twitter_unigram.dfm, 20) # 20 top Unigram words
## just like get love good thanks rt day can know
## 2297 1824 1630 1601 1546 1434 1425 1344 1341 1259
## now one u time go great today lol new see
## 1205 1193 1181 1121 1108 1103 1080 1043 1030 958
topfeatures(twitter_bigram.dfm, 20) # 20 top Bigram words
## right_now last_night happy_birthday looking_forward
## 237 171 128 126
## feel_like just_got good_morning good_luck
## 106 105 102 101
## thanks_follow follow_back looks_like let_know
## 99 86 85 83
## can_get thanks_rt thanks_much next_week
## 80 72 72 70
## make_sure first_time great_day please_follow
## 67 66 64 62
topfeatures(twitter_trigram.dfm, 20) # 20 top Trigram words
## happy_new_year happy_mother's_day let_us_know
## 30 30 28
## happy_mothers_day cinco_de_mayo show_last_night
## 16 15 13
## looking_forward_seeing cant_wait_see just_got_back
## 13 13 11
## ralph_waldo_emerson st_patrick's_day please_follow_back
## 11 11 10
## just_got_done keep_good_work thanks_following_us
## 10 10 9
## looking_forward_next just_finished_mi finished_mi_run
## 9 9 9
## ever_ever_ever go_go_go
## 9 8
I have gone through multiple articles and YouTube videos on text mining and learned a great deal, especially about the “quanteda” library and how a text data set expands into different n-grams and bags of words. The quanteda library proved useful for generating the text analytics, and it is much faster than the tm library.
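These n-gram frequencies are the raw material for the next-word prediction in the planned Shiny app. The sketch below is only an illustration of the idea, not the final model: the names ngram_freq_table and predict_next_word are hypothetical, and the lookup simply backs off from trigram to bigram counts built from the dfm objects above (note that stop words were removed during tokenization, so the input phrase should also be stop-word free).
# Split each n-gram feature (quanteda joins tokens with "_") into a prefix and its final word
ngram_freq_table <- function(ngram_dfm) {
    counts <- colSums(ngram_dfm)                      # total count per n-gram feature
    parts  <- strsplit(names(counts), "_", fixed = TRUE)
    data.frame(prefix   = vapply(parts, function(p) paste(head(p, -1), collapse = " "), character(1)),
               nextword = vapply(parts, function(p) tail(p, 1), character(1)),
               count    = as.numeric(counts),
               stringsAsFactors = FALSE)
}
bigram_tab  <- ngram_freq_table(twitter_bigram.dfm)
trigram_tab <- ngram_freq_table(twitter_trigram.dfm)
# Return the most frequent continuation, trying the trigram table first and backing off to bigrams
predict_next_word <- function(phrase, trigram_tab, bigram_tab) {
    words <- tolower(unlist(strsplit(phrase, "\\s+")))
    if (length(words) >= 2) {
        key  <- paste(tail(words, 2), collapse = " ")
        hits <- trigram_tab[trigram_tab$prefix == key, ]
        if (nrow(hits) > 0) return(hits$nextword[which.max(hits$count)])
    }
    key  <- tail(words, 1)
    hits <- bigram_tab[bigram_tab$prefix == key, ]
    if (nrow(hits) > 0) return(hits$nextword[which.max(hits$count)])
    NA_character_
}
predict_next_word("happy new", trigram_tab, bigram_tab)   # likely "year", given the trigram counts above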