The goal here is to build your first simple model of the relationships between words. This is the first step in building a predictive text mining application: you will start by exploring simple models and then move on to more sophisticated modeling techniques.
Tasks to accomplish
Build basic n-gram model: using the exploratory analysis you performed, build a basic n-gram model for predicting the next word based on the previous 1, 2, or 3 words.
Build a model to handle unseen n-grams: in some cases people will want to type a combination of words that does not appear in the corpora. Build a model to handle cases where a particular n-gram isn't observed.
Questions to consider
How can you efficiently store an n-gram model (think Markov chains)?
How can you use the knowledge about word frequencies to make your model smaller and more efficient?
How many parameters do you need (i.e., how big is n in your n-gram model)?
Can you think of simple ways to "smooth" the probabilities (think about giving all n-grams a non-zero probability even if they aren't observed in the data)?
How do you evaluate whether your model is any good?
How can you use backoff models to estimate the probability of unobserved n-grams?
A small toy sketch of the storage and smoothing ideas follows below.
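Before the exploratory code, here is a minimal toy sketch of two of those ideas: storing a bigram model as a Markov-chain-style transition table, and add-one (Laplace) smoothing so that unseen word pairs still receive a non-zero probability. The two sentences and the bigram_prob() helper are illustrative assumptions only; they are not drawn from the corpora analysed below.
# Toy corpus; sentence boundaries are ignored for brevity
toy <- c("the cat sat on the mat", "the dog sat on the rug")
toy_words <- unlist(strsplit(toy, "\\s+"))
vocab <- unique(toy_words)
# Bigram counts stored as a transition table: rows = current word, columns = next word
bigrams <- table(head(toy_words, -1), tail(toy_words, -1))
# Add-one smoothed estimate of P(next word | previous word)
bigram_prob <- function(prev, nxt) {
  count <- if (prev %in% rownames(bigrams) && nxt %in% colnames(bigrams)) bigrams[prev, nxt] else 0
  prev_total <- if (prev %in% rownames(bigrams)) sum(bigrams[prev, ]) else 0
  (count + 1) / (prev_total + length(vocab))
}
bigram_prob("the", "cat")   # observed bigram
bigram_prob("cat", "dog")   # unseen bigram, still non-zero thanks to smoothing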
# Libraries required for the project
library(tm)
## Loading required package: NLP
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
#open connection to news data
connect <- file("en_US.news.txt", open="r")
News_Data <- readLines(connect)
## Warning in readLines(connect): incomplete final line found on
## 'en_US.news.txt'
close(connect)
#open connection to twitter data
connect <- file("en_US.twitter.txt", open="r")
Twitter_Data <- readLines(connect)
## Warning in readLines(connect): line 167155 appears to contain an embedded
## nul
## Warning in readLines(connect): line 268547 appears to contain an embedded
## nul
## Warning in readLines(connect): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(connect): line 1759032 appears to contain an embedded
## nul
close(connect)
#open connection to blog data
connect <- file("en_US.blogs.txt", open="r")
Blog_Data <- readLines(connect)
close(connect)
# Basic statistics for a text file: size in MB, number of lines,
# line number of the longest line, and total word count
get_stat <- function(text_file, lines) {
  f_size <- file.info(text_file)$size / 1024^2
  nchars <- nchar(lines)
  longest_line <- which.max(nchars)   # index of the longest line, not its length
  word_count <- sum(sapply(strsplit(lines, "\\s+"), length))
  return(c(text_file, format(round(f_size, 2), nsmall = 2),
           length(lines), longest_line, word_count))
}
News_Stat<- get_stat("en_US.news.txt", News_Data)
Twitter_Stat<- get_stat("en_US.twitter.txt", Twitter_Data)
Blog_Stat <- get_stat("en_US.blogs.txt", Blog_Data)
Summary_All <- c(News_Stat, Blog_Stat, Twitter_Stat)
df <- data.frame(matrix(unlist(Summary_All), nrow=3, byrow=T))
colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Longest_Line_Index", "Words_Count")
print(df)
##           Text_file Size(MB) Line_Count Longest_Line_Index Words_Count
## 1    en_US.news.txt   196.28      77259              14556     2643972
## 2   en_US.blogs.txt   200.42     899288             483415    37334441
## 3 en_US.twitter.txt   159.36    2360148            1484357    30373792
# Collapse the sampled lines into a single document and wrap it in a tm corpus
Generate_Corpus <- function(test_file) {
  corpus_gen <- paste(test_file, collapse = " ")
  corpus_gen <- VectorSource(corpus_gen)
  corpus_gen <- Corpus(corpus_gen)
  return(corpus_gen)
}
# Standard cleaning: drop numbers, lower-case, remove English stop words,
# strip punctuation and extra whitespace
Clean_Corpus <- function(corpus_cln) {
  corpus_cln <- tm_map(corpus_cln, removeNumbers)
  corpus_cln <- tm_map(corpus_cln, content_transformer(tolower))
  corpus_cln <- tm_map(corpus_cln, removeWords, stopwords("english"))
  corpus_cln <- tm_map(corpus_cln, removePunctuation)
  corpus_cln <- tm_map(corpus_cln, stripWhitespace)
  return(corpus_cln)
}
# Build a document-term matrix and return word frequencies, most frequent first
Common_Words <- function(corpus_cln) {
  term_sparse <- DocumentTermMatrix(corpus_cln)
  term_matrix <- as.matrix(term_sparse)
  words_common <- colSums(term_matrix)
  words_common <- as.data.frame(sort(words_common, decreasing=TRUE))
  words_common$word <- rownames(words_common)
  colnames(words_common) <- c("Frequency", "word")
  return(words_common)
}
# Work with a 10% random sample of each source to keep memory use manageable
News_Data_Text <- sample(News_Data, round(0.1*length(News_Data)), replace = F)
News_Data_Corpus <- Generate_Corpus(News_Data_Text)
News_Data_Corpus <- Clean_Corpus(News_Data_Corpus)
## Warning in tm_map.SimpleCorpus(corpus_cln, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, stripWhitespace): transformation
## drops documents
Most_Popular_Word_News <- Common_Words(News_Data_Corpus)
Most_Popular_Word_SubN<- Most_Popular_Word_News[1:15,]
p<-ggplot(data=Most_Popular_Word_SubN, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Words") +labs(title = "Most Popular News Words") +theme(legend.title=element_blank()) + coord_flip()
Twitter_Data_Text<-sample(Twitter_Data, round(0.1*length(Twitter_Data)), replace = F)
Twitter_Data_Corpus <- Generate_Corpus(Twitter_Data_Text)
Twitter_Data_Corpus <- Clean_Corpus(Twitter_Data_Corpus)
## Warning in tm_map.SimpleCorpus(corpus_cln, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, stripWhitespace): transformation
## drops documents
Most_Popular_Word_Twit <- Common_Words(Twitter_Data_Corpus)
Most_Popular_Word_SubT<- Most_Popular_Word_Twit[1:15,]
p<-ggplot(data=Most_Popular_Word_SubT, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Words") +labs(title = "Most Popular News Words") +theme(legend.title=element_blank()) + coord_flip()
Blog_Data_Text <- sample(Blog_Data, round(0.1*length(Blog_Data)), replace = F)
Blog_Data_Corpus <- Generate_Corpus(Blog_Data_Text)
Blog_Data_Corpus <- Clean_Corpus(Blog_Data_Corpus)
## Warning in tm_map.SimpleCorpus(corpus_cln, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, stripWhitespace): transformation
## drops documents
Most_Popular_Word_Blog <- Common_Words(Blog_Data_Corpus)
Most_Popular_Word_SubB<- Most_Popular_Word_Blog[1:15,]
p<-ggplot(data=Most_Popular_Word_SubB, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Words") +labs(title = "Most Popular News Words") +theme(legend.title=element_blank()) + coord_flip()
wordcloud(Most_Popular_Word_News$word[1:100], Most_Popular_Word_News$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
wordcloud(Most_Popular_Word_Twit$word[1:100], Most_Popular_Word_Twit$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
wordcloud(Most_Popular_Word_Blog$word[1:100], Most_Popular_Word_Blog$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
News_Text_sub<-sample(News_Data, round(0.01*length(News_Data)), replace = F)
Token_News<- tokens(News_Text_sub,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
Token_News <- tokens_tolower(Token_News)
Token_News <- tokens_select(Token_News, stopwords(),selection ="remove")
Unigram_News <- tokens_ngrams(Token_News, n=1)
Unigram_News.dfm <- dfm(Unigram_News, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Bigram_News <- tokens_ngrams(Token_News, n=2)
Bigram_News.dfm <- dfm(Bigram_News, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Trigram_News <- tokens_ngrams(Token_News, n=3)
Trigram_News.dfm <- dfm(Trigram_News, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(Unigram_News.dfm, 20)
## said â one just can also new s people time
## 186 104 80 63 55 53 52 50 49 46
## two last first year state get game now years way
## 44 42 42 40 36 32 32 32 31 31
topfeatures(Bigram_News.dfm, 20)
## new_york last_week st_louis itâ_s didnâ_t
## 10 7 7 6 6
## even_though medical_center â_said thereâ_s â_œthe
## 6 6 5 5 5
## los_angeles united_states high_school last_season last_year
## 5 5 5 4 4
## can_get police_said two_years new_jersey â_œi
## 4 4 4 3 3
topfeatures(Trigram_News.dfm, 20)
## new_england_patriots womenâ_s_basketball
## 2 2
## delta_air_lines said_â_œi
## 2 2
## one_food_truck department_economic_development
## 2 2
## st_square_feet third_fourth_lines
## 2 2
## smoke_cedar_bark one_bin_ladenâ
## 2 2
## bin_ladenâ_s georgia_street_community
## 2 2
## donâ_t_want based_tac_air
## 2 2
## girls_athletics_programs generate_million_million
## 2 2
## congressional_term_limits â_œobviously_itâ
## 2 1
## œobviously_itâ_s itâ_s_lot
## 1 1
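One of the questions above asks how word frequencies can be used to make the model smaller. A quick, hedged check on this news sample: how many of the most frequent unigrams are needed to cover 50% and 90% of all word instances? The coverage() helper below is an illustrative assumption, not part of the assignment code.
# Unigram counts from the news sample, most frequent first
uni_counts <- sort(colSums(Unigram_News.dfm), decreasing = TRUE)
# Number of top-ranked words needed to reach a target share of all word instances
coverage <- function(counts, target) {
  which(cumsum(counts) / sum(counts) >= target)[1]
}
coverage(uni_counts, 0.5)   # words needed to cover 50% of word instances
coverage(uni_counts, 0.9)   # words needed to cover 90% of word instances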
library(quanteda)
Twitter_Text_sub<-sample(Twitter_Data, round(0.01*length(Twitter_Data)), replace = F)
Token_Twitter<- tokens(Twitter_Text_sub,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
Token_Twitter <- tokens_tolower(Token_Twitter)
Token_Twitter <- tokens_select(Token_Twitter, stopwords(),selection ="remove")
Unigram_Twitter <- tokens_ngrams(Token_Twitter, n=1)
Unigram_Twitter.dfm <- dfm(Unigram_Twitter, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Bigram_Twitter <- tokens_ngrams(Token_Twitter, n=2)
Bigram_Twitter.dfm <- dfm(Bigram_Twitter, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Trigram_Twitter <- tokens_ngrams(Token_Twitter, n=3)
Trigram_Twitter.dfm <- dfm(Trigram_Twitter, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(Unigram_Twitter.dfm, 20)
## just like get love good day rt can thanks one
## 1553 1223 1146 1046 1001 953 924 921 885 878
## know now great u go time today lol new see
## 834 821 813 812 786 742 741 670 661 644
topfeatures(Bigram_Twitter.dfm, 20)
## â_œ right_now last_night looking_forward
## 215 135 114 101
## happy_birthday good_morning thanks_follow just_got
## 89 83 78 77
## looks_like follow_back let_know can_get
## 70 62 61 60
## good_luck great_day please_follow sounds_like
## 54 52 51 50
## ðÿ_ðÿ feel_like next_week one_day
## 47 46 45 41
topfeatures(Trigram_Twitter.dfm, 20)
## happy_mothers_day î_î_î let_us_know
## 27 24 23
## ðÿ_ðÿ_ðÿ happy_mother's_day looking_forward_seeing
## 15 14 13
## follow_follow_follow â_â_â happy_new_year
## 12 11 11
## gt_gt_gt just_got_done rt_â_œ
## 11 10 10
## dreamed_dreamed_dreamed louis_louis_louis cake_cake_cake
## 9 8 8
## love_love_love cinco_de_mayo get_work_done
## 7 7 7
## thanks_following_us ralph_waldo_emerson
## 7 7
library(quanteda)
Blog_Text_sub<-sample(Blog_Data, round(0.01*length(Blog_Data)), replace = F)
Token_Blog<- tokens(Blog_Text_sub,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
Token_Blog <- tokens_tolower(Token_Blog)
Token_Blog <- tokens_select(Token_Blog, stopwords(),selection ="remove")
Unigram_Blog <- tokens_ngrams(Token_Blog, n=1)
Unigram_Blog.dfm <- dfm(Unigram_Blog, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Bigram_Blog <- tokens_ngrams(Token_Blog, n=2)
Bigram_Blog.dfm <- dfm(Bigram_Blog, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Trigram_Blog <- tokens_ngrams(Token_Blog, n=3)
Trigram_Blog.dfm <- dfm(Trigram_Blog, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(Unigram_Blog.dfm, 20)
## â s one can like just t time get people
## 2243 1656 1231 996 994 989 913 886 737 610
## new know now also back day iâ make good see
## 577 571 539 534 530 518 517 504 504 500
topfeatures(Bigram_Blog.dfm, 20)
## itâ_s donâ_t iâ_m iâ_ve didnâ_t thatâ_s canâ_t
## 392 266 264 138 123 115 90
## youâ_re doesnâ_t thereâ_s isnâ_t â_œthe years_ago iâ_ll
## 72 70 65 65 65 60 57
## new_york iâ_d feel_like wasnâ_t heâ_s right_now
## 56 55 52 51 49 49
topfeatures(Trigram_Blog.dfm, 20)
## donâ_t_know iâ_m_sure iâ_m_going
## 30 29 18
## italian_people_italy donâ_t_think donâ_t_need
## 17 14 13
## donâ_t_want itâ_s_just itâ_s_like
## 13 13 12
## didnâ_t_know donâ_t_really donâ_t_get
## 11 11 11
## ð_ð_ð iâ_m_just â_œiâ_m
## 10 10 10
## â_œitâ_s didnâ_t_want iâ_d_like
## 10 10 10
## iâ_ve_never donâ_t_go
## 10 8
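As a bridge to the prediction model, the sketch below shows one possible (untuned) backoff lookup built from the Twitter n-gram counts above: check the last two typed words against the trigram counts, fall back to the last word in the bigram counts, and finally to the most frequent unigram. The ngram_table() and predict_next() helpers are illustrative assumptions, as is the reliance on quanteda's default "_" concatenator between n-gram tokens.
# Turn an n-gram dfm into a prefix -> next-word count table
# (assumes "_" joins the tokens of each n-gram feature)
ngram_table <- function(dfm_obj) {
  counts <- colSums(dfm_obj)
  feats <- names(counts)
  data.frame(prefix = sub("_[^_]+$", "", feats),   # everything before the last "_"
             next_word = sub("^.*_", "", feats),   # token after the last "_"
             count = as.numeric(counts),
             stringsAsFactors = FALSE)
}
tri_tab <- ngram_table(Trigram_Twitter.dfm)
bi_tab  <- ngram_table(Bigram_Twitter.dfm)
uni_top <- names(topfeatures(Unigram_Twitter.dfm, 1))
# Simple backoff: trigram match first, then bigram, then the most frequent word
predict_next <- function(words) {
  words <- tolower(words)
  n <- length(words)
  if (n >= 2) {
    hits <- tri_tab[tri_tab$prefix == paste(words[(n - 1):n], collapse = "_"), ]
    if (nrow(hits) > 0) return(hits$next_word[which.max(hits$count)])
  }
  hits <- bi_tab[bi_tab$prefix == words[n], ]
  if (nrow(hits) > 0) return(hits$next_word[which.max(hits$count)])
  uni_top   # final fallback
}
predict_next(c("happy", "mothers"))   # trigram lookup if the prefix was observed
predict_next(c("right"))              # backs off to the bigram table
A natural next step, addressing the evaluation question above, would be to hold out part of each corpus and measure how often the predicted word matches the actual next word.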