Quanteda Package

The goal of this project is to demonstrate that we have become familiar with the data and that we are on track to create our prediction algorithm. This report (to be submitted on RPubs (http://rpubs.com/)) explains our exploratory analysis and our goals for the eventual app and algorithm. The document is kept concise: it covers only the major features of the data we have identified and briefly summarizes our plans for creating the prediction algorithm and Shiny app, in a way that is understandable to a non-data-scientist manager. We use tables and plots to illustrate important summaries of the data set. In short, the motivation for this project is to demonstrate familiarity with the data and to outline our plan for the prediction algorithm and Shiny app.
library(R.utils) # various R programming facilities
library(ggplot2) # ggplot plotting package
library(dplyr) # data manipulation utilities
library(quanteda) # quantitative text analysis package
# library(ngram) # R package for constructing n-grams (“tokenizing”), as well as generating new text based on the n-gram structure of a given text input (“babbling”)
# library(slam) # sparse lightweight arrays and matrices
# library(tidytext) # tidy text mining package
# library(textmineR) # functions for text mining and topic modeling
# library(tau) # text analysis utilities
# library(stringi) # character string processing package
# library(readtext)
# library(tm) # text mining package
# library(RWeka) # a collection of machine learning algorithms for data mining tasks written in Java
# library(SnowballC) # an R interface to the C 'libstemmer' library that implements Porter's word stemming algorithm for collapsing words to a common root to aid comparison of vocabulary.
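Note: in quanteda 3.0 and later, the textstat_frequency() function used below is provided by the companion package quanteda.textstats rather than by quanteda itself. A small guard such as the following sketch loads it when it is installed (the if-check is our addition, not part of the original script):

# textstat_frequency() moved to the quanteda.textstats package in quanteda >= 3.0;
# load it if it is installed so the frequency tables below also work on newer versions.
if (requireNamespace("quanteda.textstats", quietly = TRUE)) {
  library(quanteda.textstats)
}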
blogs <- "final/en_US/en_US.blogs.txt"
news <- "final/en_US/en_US.news.txt"
twitter <- "final/en_US/en_US.twitter.txt"
en_US.blogs.txt
blog_line <- readLines(blogs, encoding = "UTF-8", skipNul = TRUE)
blog_corpus <- corpus(blog_line)
num_blog_entries <- sapply(blogs,countLines) # count number of entries in file
print(paste("Number of entries - blogs: ",num_blog_entries))
## [1] "Number of entries - blogs: 899288"
print(paste("Number of sentences - blogs: ",sum(nsentence(blog_corpus))))
## [1] "Number of sentences - blogs: 2362935"
print(paste("Number of tokens - blogs: ",sum(ntoken(blog_corpus,remove_punct=TRUE))))
## [1] "Number of tokens - blogs: 37339814"
en_US.news.txt
news_line <- readLines(news, encoding = "UTF-8", skipNul = TRUE)
news_corpus <- corpus(news_line)
num_news_entries <- sapply(news,countLines) # count number of lines in file
print(paste("Number of entries - news: ",num_news_entries))
## [1] "Number of entries - news: 1010242"
print(paste("Number of sentences - news: ",sum(nsentence(news_corpus))))
## [1] "Number of sentences - news: 1992553"
print(paste("Number of tokens - news: ",sum(ntoken(news_corpus,remove_punct=TRUE))))
## [1] "Number of tokens - news: 34376642"
en_US.twitter.txt
twitter_line <- readLines(twitter, encoding = "UTF-8", skipNul = TRUE)
twitter_corpus <- corpus(twitter_line)
num_twitter_entries <- sapply(twitter,countLines) # count number of lines in file
print(paste("Number of entries - twitter: ",num_twitter_entries))
## [1] "Number of entries - twitter: 2360148"
print(paste("Number of sentences - twitter: ",sum(nsentence(twitter_corpus))))
## [1] "Number of sentences - twitter: 3754216"
print(paste("Number of tokens - twitter: ",sum(ntoken(twitter_corpus,remove_punct=TRUE))))
## [1] "Number of tokens - twitter: 30162656"
To keep the analysis manageable, we take a 10% random sample of each of the three .txt files.
set.seed(42)
data.sample <- c(blog_line[sample(1:length(blog_line), length(blog_line)*0.1)],
                 news_line[sample(1:length(news_line), length(news_line)*0.1)],
                 twitter_line[sample(1:length(twitter_line), length(twitter_line)*0.1)])
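To keep the exploratory sample reproducible across sessions, it can also be written to disk once and re-read later instead of re-sampling the full files. A minimal sketch (the file name sample_en_US.txt is our own choice):

writeLines(data.sample, "final/en_US/sample_en_US.txt") # persist the 10% sample
# data.sample <- readLines("final/en_US/sample_en_US.txt", encoding = "UTF-8", skipNul = TRUE)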
We then remove the blog_line, news_line and twitter_line objects and their associated corpora to free up memory, since we don't need them anymore.
rm(list=c("blog_line","news_line","twitter_line","blog_corpus","news_corpus","twitter_corpus"))
Next, we build a corpus from the sampled data and clean and tokenize it using the quanteda package.
sample_corpus <- corpus(data.sample)
sample_tokens <- tokens(sample_corpus, what = "word",
                        remove_punct = TRUE,
                        remove_symbols = TRUE,
                        remove_numbers = TRUE,
                        remove_url = TRUE,
                        remove_separators = TRUE,
                        split_hyphens = FALSE,
                        include_docvars = TRUE,
                        padding = FALSE)
sample_tokens <- tokens_tolower(sample_tokens)
sample_tokens <- tokens_wordstem(sample_tokens,
                                 language = quanteda_options("language_stemmer"))
sample_tokens <- tokens_select(sample_tokens, pattern = stopwords("en"), selection = "remove")
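Since exactly the same preprocessing will need to be applied to whatever text the eventual Shiny app receives, the steps above could be wrapped in a single reusable function. The sketch below simply repackages the calls already made; the name clean_tokens() is our own:

# Reusable wrapper around the cleaning steps above, for applying the same
# preprocessing to new input text later on (e.g. in the Shiny app).
clean_tokens <- function(txt) {
  toks <- tokens(corpus(txt), what = "word",
                 remove_punct = TRUE, remove_symbols = TRUE,
                 remove_numbers = TRUE, remove_url = TRUE,
                 remove_separators = TRUE)
  toks <- tokens_tolower(toks)
  toks <- tokens_wordstem(toks)
  tokens_select(toks, pattern = stopwords("en"), selection = "remove")
}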
n_grams_plot <- function(n, data) {
  # use `reorder` to sort the features in decreasing order of frequency
  ggplot(data, aes(x = reorder(feature, -frequency, sum), y = frequency)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    xlab(paste(n, "- grams")) +
    ylab("Frequency") +
    ggtitle(paste(n, "- gram Frequencies")) +
    theme(plot.title = element_text(hjust = 0.5))
}
dfm_sample_tokens <- dfm(sample_tokens)
one_gram_freq <- textstat_frequency(dfm_sample_tokens)
head(one_gram_freq, 15)
## feature frequency rank docfreq group
## 1 one 30938 1 27084 all
## 2 said 30764 2 27951 all
## 3 just 30693 3 28470 all
## 4 like 30349 4 27076 all
## 5 get 29908 5 27045 all
## 6 go 26736 6 24096 all
## 7 time 25987 7 23174 all
## 8 can 25089 8 22226 all
## 9 day 22450 9 20099 all
## 10 year 21639 10 18841 all
## 11 love 20412 11 18122 all
## 12 make 20410 12 18688 all
## 13 new 19413 13 17459 all
## 14 good 18227 14 16928 all
## 15 know 18189 15 16471 all
n_grams_plot(1,head(one_gram_freq,15))
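One further question the unigram frequency table can answer is how many distinct (stemmed) word types are needed to cover a given share of all word instances in the sample, which is relevant to the size of the eventual prediction model. The sketch below computes this from one_gram_freq; the coverage() helper is our own:

# Number of word types needed to cover a proportion p of all tokens, based on
# the frequency-sorted table returned by textstat_frequency().
coverage <- function(freq, p) {
  cum <- cumsum(freq$frequency) / sum(freq$frequency)
  which(cum >= p)[1]
}
coverage(one_gram_freq, 0.5) # types needed to cover 50% of tokens
coverage(one_gram_freq, 0.9) # types needed to cover 90% of tokens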
two_grams <- dfm(tokens_ngrams(sample_tokens, n = 2))
two_gram_freq <- textstat_frequency(two_grams)
head(two_gram_freq,15)
## feature frequency rank docfreq group
## 1 right_now 2463 1 2424 all
## 2 last_year 2284 2 2223 all
## 3 look_like 2206 3 2146 all
## 4 new_york 2011 4 1885 all
## 5 feel_like 1752 5 1702 all
## 6 look_forward 1736 6 1706 all
## 7 last_night 1683 7 1660 all
## 8 year_ago 1654 8 1619 all
## 9 high_school 1477 9 1373 all
## 10 last_week 1420 10 1398 all
## 11 thank_follow 1247 11 1247 all
## 12 first_time 1206 12 1180 all
## 13 make_sure 1194 13 1158 all
## 14 can_get 1106 14 1096 all
## 15 st_loui 956 15 865 all
n_grams_plot(2,head(two_gram_freq,15))
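The 3-, 4- and 5-gram tables that follow repeat the same three steps (tokens_ngrams(), dfm(), textstat_frequency()). For reference, that pattern could be captured once in a small helper such as this sketch (the name ngram_freq() is our own):

# Generic n-gram frequency table: the same steps used throughout this section,
# parameterized by n.
ngram_freq <- function(toks, n) {
  textstat_frequency(dfm(tokens_ngrams(toks, n = n)))
}
# e.g. head(ngram_freq(sample_tokens, 3), 15)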
three_grams <- dfm(tokens_ngrams(sample_tokens, n = 3))
three_gram_freq <- textstat_frequency(three_grams)
head(three_gram_freq,15)
## feature frequency rank docfreq group
## 1 happi_mother_day 375 1 373 all
## 2 new_york_citi 275 2 272 all
## 3 let_us_know 255 3 255 all
## 4 happi_new_year 214 4 214 all
## 5 presid_barack_obama 188 5 188 all
## 6 look_forward_see 165 6 164 all
## 7 two_year_ago 162 7 161 all
## 8 amp_amp_amp 157 8 1 all
## 9 new_york_time 154 9 151 all
## 10 cinco_de_mayo 141 10 136 all
## 11 world_war_ii 123 11 119 all
## 12 st_loui_counti 122 12 118 all
## 13 gov_chris_christi 115 13 114 all
## 14 dream_come_true 107 14 107 all
## 15 first_time_sinc 102 15 102 all
n_grams_plot(3,head(three_gram_freq,15))
four_grams <- dfm(tokens_ngrams(sample_tokens, n = 4))
four_gram_freq <- textstat_frequency(four_grams)
head(four_gram_freq,15)
## feature frequency rank docfreq group
## 1 amp_amp_amp_gt 59 1 1 all
## 2 happi_mother_day_mom 52 2 52 all
## 3 amp_amp_amp_amp 52 2 1 all
## 4 martin_luther_king_jr 51 4 50 all
## 5 amazon_servic_llc_amazon 48 5 24 all
## 6 servic_llc_amazon_eu 48 5 24 all
## 7 amp_amp_amp_lt 46 7 1 all
## 8 amp_amp_lt_span 46 7 1 all
## 9 thank_follow_look_forward 42 9 42 all
## 10 g_fat_g_satur 40 10 40 all
## 11 happi_st_patrick_day 36 11 36 all
## 12 g_protein_g_carbohydr 33 12 33 all
## 13 s_o_new_follow 32 13 32 all
## 14 dure_world_war_ii 32 13 32 all
## 15 make_dream_come_true 31 15 31 all
n_grams_plot(4,head(four_gram_freq,15))
five_grams <- dfm(tokens_ngrams(sample_tokens, n = 5))
five_gram_freq <- textstat_frequency(five_grams)
head(five_gram_freq,15)
## feature frequency rank docfreq group
## 1 amazon_servic_llc_amazon_eu 48 1 24 all
## 2 amp_amp_amp_lt_span 46 2 1 all
## 3 amp_amp_amp_amp_amp 41 3 1 all
## 4 g_protein_g_carbohydr_g 31 4 31 all
## 5 style_background_none_repeat_scroll 30 5 1 all
## 6 background_none_repeat_scroll_yellow 30 5 1 all
## 7 none_repeat_scroll_yellow_class 29 7 1 all
## 8 span_style_background_none_repeat 29 7 1 all
## 9 span_amp_amp_amp_gt 28 9 1 all
## 10 content_provid_subject_chang_remov 25 10 25 all
## 11 provid_subject_chang_remov_ani 25 10 25 all
## 12 subject_chang_remov_ani_time 25 10 25 all
## 13 particip_amazon_servic_llc_amazon 24 13 24 all
## 14 servic_llc_amazon_eu_associ 24 13 24 all
## 15 llc_amazon_eu_associ_programm 24 13 24 all
n_grams_plot(5,head(five_gram_freq,15))
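Several of the most frequent 4- and 5-grams above (amp_amp_amp_gt, span_style_background_none_repeat, and so on) are clearly HTML/markup residue rather than natural language, and many of them appear in only a single document (docfreq = 1). Before building the prediction algorithm it would be worth removing such tokens; the pattern list in the sketch below is illustrative, not exhaustive:

# Drop tokens that come from leftover HTML/markup rather than real text.
# The pattern list is an illustrative starting point, not a complete list.
markup_tokens <- c("amp", "lt", "gt", "span", "style", "background",
                   "repeat", "scroll", "class")
sample_tokens_clean <- tokens_remove(sample_tokens, pattern = markup_tokens)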