# Read every line of a text file into a character vector.
#
# path: path to the text file.
# Returns a character vector with one element per line.
#
# The connection is opened in binary mode so the bytes are read as-is rather
# than through platform text-mode handling. The original text-mode read of
# the news file warned about an incomplete final line, which suggests the
# read may have stopped early -- TODO confirm the line count after this change.
# skipNul = TRUE drops embedded NUL bytes (the original twitter read warned
# about four such lines) instead of letting them cut lines short.
read_text_lines <- function(path) {
  con <- file(path, "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE)
}

blogs_lines <- read_text_lines("en_US.blogs.txt")
news_lines <- read_text_lines("en_US.news.txt")
twitter_lines <- read_text_lines("en_US.twitter.txt")
# Normalise raw text lines and keep only those with more than two words.
#
# x: character vector of text lines.
# Returns a character vector containing the cleaned lines that still have at
# least three whitespace-delimited words; shorter lines are dropped, so the
# result may be shorter than `x`.
#
# Cleaning steps, in order: lower-case; blank out any whitespace-delimited
# token containing "@", "#", "http://" or "https://"; drop bytes that cannot
# be represented in ASCII; delete punctuation and digits.
CleanR <- function(x) {
  sampleTxt <- tolower(x)
  # Replace the whole token (mention, hashtag, URL) with a single space.
  sampleTxt <- gsub("([^[:space:]]*)(@|#|http://|https://)([^[:space:]]*)", " ", sampleTxt)
  sampleTxt <- iconv(sampleTxt, "latin1", "ASCII", sub = "")
  # [[:punct:]] already covers the apostrophe, so no separate pass is needed.
  sampleTxt <- gsub("[[:punct:]]", "", sampleTxt)
  sampleTxt <- gsub("[[:digit:]]", "", sampleTxt)

  # Count whitespace-delimited words. Leading/trailing whitespace is trimmed
  # first so strsplit() does not produce an empty leading element.
  trimmed <- gsub("^[[:space:]]+|[[:space:]]+$", "", sampleTxt)
  word_counts <- lengths(strsplit(trimmed, "[[:space:]]+"))

  sampleTxt[word_counts > 2]
}
# Clean each source with CleanR().
blogs_lines <- CleanR(blogs_lines)
news_lines <- CleanR(news_lines)
twitter_lines <- CleanR(twitter_lines)

# Down-sample each source without replacement: 50% of blogs, 80% of news,
# 50% of twitter. floor() makes the truncation to a whole line count explicit.
# NOTE(review): no seed is set here, so the sample differs between runs.
blogs_lines <- sample(blogs_lines, floor(length(blogs_lines) * 0.5))
news_lines <- sample(news_lines, floor(length(news_lines) * 0.8))
# Sample twitter from the twitter lines (previously drawn from blogs_lines,
# which discarded the twitter data and duplicated blog lines).
twitter_lines <- sample(twitter_lines, floor(length(twitter_lines) * 0.5))
# TriGrams ----
# Pool the three sampled sources; the per-source vectors are no longer needed
# once they have been combined.
allFiles <- c(blogs_lines, news_lines, twitter_lines)
rm(blogs_lines, news_lines, twitter_lines)

# Build the corpus, tokenise it (dropping punctuation) and remove English
# stop words. The intermediate corpus and token objects are passed straight
# through instead of being bound to names and removed afterwards.
toks_nostop <- tokens_select(
  tokens(corpus(allFiles), remove_punct = TRUE),
  pattern = stopwords("en"),
  selection = "remove"
)

# Form trigrams, keep up to the 500000 top features of their
# document-feature matrix, and save that table to disk.
ngramTri <- tokens_ngrams(toks_nostop, n = 3)
topNgramTRI <- topfeatures(dfm(ngramTri), n = 500000)
saveRDS(topNgramTRI, file = "topNgramTRI.rds")
# Read In Top TriGrams ----
# Load the saved trigram table from disk.
#
# path: location of the .rds file; defaults to "topNgramTRI.rds".
# Returns (invisibly) the object stored in the file.
#
# The value is only returned; nothing is assigned outside this function.
# get_word3() looks up a variable named topNgramTRI, so callers need to
# assign the result themselves, e.g. `topNgramTRI <- readIn()`.
readIn <- function(path = "topNgramTRI.rds") {
  invisible(readRDS(path))
}
# Suggest third words for the last two words of a phrase.
#
# x:      the phrase typed so far (a single string; if a longer vector is
#         supplied only its first element is used).
# ngrams: named vector whose names are trigrams joined by "_"
#         ("w1_w2_w3"). Defaults to the topNgramTRI variable visible from
#         where this function was defined.
# Returns a character vector with the third word of every trigram whose
# first two words equal the last two words of `x`, in the order they appear
# in `ngrams`; character(0) when `x` has fewer than two words or nothing
# matches.
#
# NOTE(review): the trigram table appears to be built after stop-word
# removal, so phrases ending in stop words will not match -- confirm whether
# the input should be filtered the same way.
get_word3 <- function(x, ngrams = topNgramTRI) {
  phrase <- tolower(as.character(x)[1L])
  if (is.na(phrase)) {
    return(character(0))
  }

  # Strip punctuation and digits, as the corpus cleaning does, so the typed
  # words line up with the stored trigram names.
  phrase <- gsub("[[:punct:][:digit:]]", "", phrase)
  words <- strsplit(phrase, "[[:space:]]+")[[1L]]
  words <- words[nzchar(words)]
  if (length(words) < 2L) {
    return(character(0))
  }

  keys <- names(ngrams)
  if (is.null(keys)) {
    return(character(0))
  }

  # Match "w1_w2_" as a fixed string. The trailing "_" requires the second
  # word to match in full, and no regular expression is built from the input.
  prefix <- paste0(paste(utils::tail(words, 2L), collapse = "_"), "_")
  hits <- keys[startsWith(keys, prefix)]
  substring(hits, nchar(prefix) + 1L)
}