# Script setup: clear the workspace, attach packages, set the working directory.
#
# Remove every (non-hidden) object from the global environment so leftovers
# from an earlier session cannot leak into this run.
# NOTE(review): this also wipes the caller's workspace when the file is
# source()d; running in a fresh R session is the safer way to get a clean slate.
rm(list = ls())
# quanteda: corpus(), tokens*(), dfm(), stopwords() used below.
# NOTE(review): textstat_frequency() is called below with only quanteda
# attached -- presumably this targets a quanteda release that still exports
# it (later releases ship it in a separate package); TODO confirm the version.
library(quanteda)
# data.table: as.data.table() and fwrite() used below.
library(data.table)
# NOTE(review): nothing from dplyr or stringr is called in the visible part of
# this script -- confirm they are needed before keeping them. Attach order is
# left untouched because it determines which package masks which.
library(dplyr)
library(stringr)
# getwd()
# Every relative path below (the "final/en_US/..." inputs and the *.csv
# outputs) resolves against this directory.
# NOTE(review): machine-specific absolute path; setwd() errors on any machine
# that does not have this folder.
setwd("C:/Users/pruebas/Documents/Capstone_Katz_Back_Off")
# Read the three en_US corpora (blogs, news, twitter) into character vectors,
# one element per line, and pool a reproducible 5% random sample of each into
# `samplecorpus`.

# Read every line of `path`, marking strings as UTF-8 and skipping embedded
# NULs. The file is opened in binary mode ("rb") on purpose: in Windows text
# mode a stray Ctrl-Z (0x1A) byte is treated as end-of-file, so readLines() on
# a bare path can silently return only the first part of a corpus file. Binary
# mode reads through to the real end of file on every platform, and
# readLines() still accepts LF, CRLF and CR line endings.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Draw floor(fraction * length(lines)) lines without replacement.
# sample.int() replaces sample(1:length(x), ...) and consumes the random
# stream identically; floor() makes explicit the truncation that sample()
# previously applied implicitly to the fractional size.
sample_lines <- function(lines, fraction = 0.05) {
  n_lines <- length(lines)
  lines[sample.int(n_lines, size = floor(n_lines * fraction))]
}

blog_line <- read_corpus_lines("final/en_US/en_US.blogs.txt")
news_line <- read_corpus_lines("final/en_US/en_US.news.txt")
twitter_line <- read_corpus_lines("final/en_US/en_US.twitter.txt")

# Fixed seed so the same lines are drawn on every run. The three draws must
# stay in blogs, news, twitter order to keep the random stream unchanged.
set.seed(1977)
samplecorpus <- c(
  sample_lines(blog_line),
  sample_lines(news_line),
  sample_lines(twitter_line)
)

# Free the full corpora (and the two helpers) -- only the sample is needed.
rm(list = c(
  "blog_line", "news_line", "twitter_line",
  "read_corpus_lines", "sample_lines"
))
# We will use the quanteda package to construct the n-gram tables.
# Turn the sampled lines into a quanteda corpus and tokenise it into words,
# dropping punctuation, symbols, numbers, URLs and separators. Hyphenated
# words are kept whole (split_hyphens = FALSE) and removed tokens leave no
# padding behind (padding = FALSE).
sample_corpus <- corpus(samplecorpus)
sample_corpus_tokens <- tokens(
  sample_corpus,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE,
  split_hyphens = FALSE,
  include_docvars = TRUE,
  padding = FALSE
)
sample_corpus_tokens <- tokens_tolower(sample_corpus_tokens)

# Remove stopwords BEFORE stemming. The stemmer rewrites several stopwords
# (e.g. "very" -> "veri", "because" -> "becaus", "only" -> "onli"); once
# stemmed they no longer match stopwords("en") and would leak into the
# n-gram tables if the list were applied only afterwards.
sample_corpus_tokens <- tokens_select(
  sample_corpus_tokens,
  pattern = stopwords("en"),
  selection = "remove"
)

# Stem with the stemmer language configured in quanteda's options.
sample_corpus_tokens <- tokens_wordstem(
  sample_corpus_tokens,
  language = quanteda_options("language_stemmer")
)

# Second stopword pass, kept from the original pipeline: it drops inflected
# forms whose stem is itself a stopword (e.g. "others" -> "other"), so
# nothing that used to be removed is retained now.
sample_corpus_tokens <- tokens_select(
  sample_corpus_tokens,
  pattern = stopwords("en"),
  selection = "remove"
)

# The raw sample and the corpus object are no longer needed.
rm(list = c("samplecorpus", "sample_corpus"))
# Unigram table: build the document-feature matrix of single tokens, compute
# per-feature frequencies, keep only the feature and its count (renamed to
# ngram / freq) as a data.table, and save it to unigrs.csv.
dfm_sample_corpus_tokens <- dfm(sample_corpus_tokens)
unigrams_freq <- textstat_frequency(dfm_sample_corpus_tokens)
unigrs <- as.data.table(unigrams_freq[, c("feature", "frequency")])
setnames(unigrs, c("ngram", "freq"))
fwrite(unigrs, file = "unigrs.csv")
# Bigram table: form 2-grams from the token stream, count them through a
# document-feature matrix, keep only the feature and its count (renamed to
# ngram / freq) as a data.table, and save it to bigrs.csv.
bigrams <- dfm(
  tokens_ngrams(sample_corpus_tokens, n = 2L)
)
bigrams_freq <- textstat_frequency(bigrams)
bigrs <- as.data.table(bigrams_freq[, c("feature", "frequency")])
setnames(bigrs, c("ngram", "freq"))
fwrite(bigrs, file = "bigrs.csv")
# Trigram table: form 3-grams from the token stream, count them through a
# document-feature matrix, keep only the feature and its count (renamed to
# ngram / freq) as a data.table, and save it to trigrs.csv.
trigrams <- dfm(
  tokens_ngrams(sample_corpus_tokens, n = 3L)
)
trigrams_freq <- textstat_frequency(trigrams)
trigrs <- as.data.table(trigrams_freq[, c("feature", "frequency")])
setnames(trigrs, c("ngram", "freq"))
fwrite(trigrs, file = "trigrs.csv")
# Preview the first six rows of each n-gram table. The "##" lines that follow
# are output recorded from one run of the script.
head(unigrs)
head(bigrs)
head(trigrs)
## ngram freq
## 1: just 12764
## 2: get 12473
## 3: like 12299
## 4: one 11425
## 5: go 10789
## 6: time 9818
## ngram freq
## 1: right_now 1067
## 2: look_like 932
## 3: feel_like 776
## 4: last_night 768
## 5: look_forward 732
## 6: thank_follow 600
## ngram freq
## 1: happi_mother_day 194
## 2: let_us_know 123
## 3: happi_new_year 106
## 4: look_forward_see 81
## 5: amp_amp_amp 74
## 6: new_york_citi 62
# Drop the intermediate objects; only unigrs, bigrs and trigrs are kept.
rm(
  dfm_sample_corpus_tokens,
  sample_corpus_tokens,
  unigrams_freq,
  bigrams_freq,
  trigrams_freq,
  bigrams,
  trigrams
)