The dataset

210,160,014 bytes en_US.blogs.txt

205,811,889 bytes en_US.news.txt

167,105,338 bytes en_US.twitter.txt

This page will show:

  1. Some basic summaries of the three files: word counts, line counts, and basic data tables

  2. Histograms to illustrate features of the data

f <- file("en_US.news.txt", open="rb")
read_news <- readLines(f) 
close(f)
f <- file("en_US.blogs.txt", open="rb")
read_blogs <- readLines(f) 
close(f)
f <- file("en_US.twitter.txt", open="rb")
read_twitter <- readLines(f) 
## Warning in readLines(f): line 167155 appears to contain an embedded nul
## Warning in readLines(f): line 268547 appears to contain an embedded nul
## Warning in readLines(f): line 1274086 appears to contain an embedded nul
## Warning in readLines(f): line 1759032 appears to contain an embedded nul
close(f)
print(paste('The News dataset has ',length(read_news),' lines, with the longest line being ',max(nchar(read_news))))
## [1] "The News dataset has  1010242  lines, with the longest line being  11384"
print(paste('The Blogs dataset has ',length(read_blogs),' lines, with the longest line being ',max(nchar(read_blogs))))
## [1] "The Blogs dataset has  899288  lines, with the longest line being  40835"
print(paste('The Twitter dataset has ',length(read_twitter),' lines, with the longest line being ',max(nchar(read_twitter))))
## [1] "The Twitter dataset has  2360148  lines, with the longest line being  213"

Word Counts

Tweets have much shorter lines. No surprise. Blogs tend to contain longer, more run-on writing than News. No surprise either.

The plan: create bigrams, trigrams, and quadgrams and build word counts around each. These datasets contain plenty of junk words and phrasings. Common advice is to clean out things like punctuation and curse words (though I suspect that stuff is still useful for prediction); a small filtering sketch follows the unigram tokenization below. The most useful reference I found was https://www.tidytextmining.com/ngrams.html

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(tibble)
## Loading required package: tibble
df_news <- tibble(text = read_news)
df_blog <- tibble(text = read_blogs)
df_twitter <- tibble(text = read_twitter)

#this is just 1-grams
tokens_news <- df_news %>% unnest_tokens(word,text)
count_news <- count(tokens_news,word,sort = TRUE)

tokens_blog <- df_blog %>% unnest_tokens(word,text)
count_blog <- count(tokens_blog,word,sort = TRUE)

tokens_twitter <- df_twitter %>% unnest_tokens(word,text)
count_twitter <- count(tokens_twitter,word,sort = TRUE)
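As a hedged sketch of the junk/profanity filtering mentioned earlier: the profanity vector below is just a placeholder, and a real published word list would be swapped in.

# Placeholder profanity list; substitute a real published word list.
profanity <- c("badword1", "badword2")

# Drop anything on the junk list from the News unigrams before counting;
# the same filter applies to the blog and twitter tokens.
count_news_clean <- tokens_news %>%
  filter(!word %in% profanity) %>%
  count(word, sort = TRUE)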

Unigram Counts

library(knitr)
# merge() defaults to an inner join, so count_all keeps only words that
# appear in all three corpora
count_all <- merge(count_news,count_blog, by = "word")
count_all <- merge(count_all,count_twitter, by = "word")

count_all <- count_all[order(count_all$n.x,decreasing = TRUE),]
colnames(count_all)[2]<- "News Count"
colnames(count_all)[3]<- "Blog Count"
colnames(count_all)[4]<- "Twitter Count"
kable(count_all[1:20,],caption = 'Unigram count')
Unigram count

          word    News Count   Blog Count   Twitter Count
  86340   the        1971816      1854232          936888
  87278   to          905909      1068623          788530
   7143   and         889057      1093307          438495
   4467   a           877651       899316          611266
  62113   of          774451       876475          359611
  43899   in          678766       597444          380282
  34679   for         353730       363211          385282
  86321   that        346201       459691          234560
  45636   is          284077       431696          358708
  62482   on          269764       275921          277955
  94594   with        254703       286464          173445
  75376   said        250403        36554           18124
  92988   was         228907       278127          117372
  40524   he          227740       144559           56393
  45770   it          217546       400818          294511
   8924   at          214022       172009          186733
   8559   as          187380       223421           70872
  41560   his         157610       109906           34159
  43109   i           153714       769248          722977
  10989   be          152777       208667          187746

Subsample because memory is overwhelmed

# Keep only the first 10% of lines from each corpus so later steps fit in memory
df_news <- df_news[1:round(0.1*length(df_news$text)),]
df_blog <- df_blog[1:round(0.1*length(df_blog$text)),]
df_twitter <- df_twitter[1:round(0.1*length(df_twitter$text)),]
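Taking the first 10% keeps only the earliest lines of each file; a random 10% would avoid any ordering bias. A sketch that could replace the chunk above (the seed value is arbitrary):

# Random 10% subsample instead of the first 10% of lines
set.seed(1234)                     # arbitrary seed, for reproducibility
df_news    <- df_news[sample(nrow(df_news), round(0.1 * nrow(df_news))), ]
df_blog    <- df_blog[sample(nrow(df_blog), round(0.1 * nrow(df_blog))), ]
df_twitter <- df_twitter[sample(nrow(df_twitter), round(0.1 * nrow(df_twitter))), ]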

N>1gram Counts

Focus on the News corpus as an example. As N increases, so does the number of distinct n-grams. This makes sense because the n-grams come from a sliding window: a line with w words yields w - N + 1 n-gram tokens, but longer n-grams repeat far less often, so the table of distinct n-grams keeps growing with N.
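The sliding-window arithmetic is easy to sanity-check on a toy line (a sketch using the same unnest_tokens call as below; the example sentence is arbitrary):

# A 5-word line yields w - N + 1 n-gram tokens: 4 bigrams, 3 trigrams, 2 quadgrams
toy <- tibble(text = "the quick brown fox jumps")
nrow(toy %>% unnest_tokens(bigram,   text, token = "ngrams", n = 2))  # 4
nrow(toy %>% unnest_tokens(trigram,  text, token = "ngrams", n = 3))  # 3
nrow(toy %>% unnest_tokens(quadgram, text, token = "ngrams", n = 4))  # 2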

print(paste('Number of unique News words: ', nrow(count_news)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_news <- df_news%>% unnest_tokens(bigram,text,token = "ngrams", n = 2)
bi_count_news <- bigrams_news %>% count(bigram, sort = TRUE)
print(paste('Number of unique News bigrams: ', nrow(bi_count_news)))
print('Getting trigrams')
## [1] "Getting trigrams"
trigrams_news <- df_news %>% unnest_tokens(trigram,text,token = "ngrams", n = 3)
print('Getting counts of trigrams')
## [1] "Getting counts of trigrams"
tri_count_news <- trigrams_news %>% count(trigram, sort = TRUE)

print('Getting quadgrams')
## [1] "Getting quadgrams"
quadgrams_news <- df_news %>% unnest_tokens(quadgram,text,token = "ngrams", n = 4)
print('Getting counts of quadgrams')
## [1] "Getting counts of quadgrams"
quad_count_news <- quadgrams_news %>% count(quadgram, sort = TRUE)


print(paste('Number of unique blog words: ', nrow(count_blog)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_blog <- df_blog%>% unnest_tokens(bigram,text,token = "ngrams", n = 2)
bi_count_blog <- bigrams_blog %>% count(bigram, sort = TRUE)
print(paste('Number of unique blog bigrams: ', nrow(bi_count_blog)))
print(paste('Number of unique twitter words: ', nrow(count_twitter)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_twitter <- df_twitter%>% unnest_tokens(bigram,text,token = "ngrams", n = 2)
bi_count_twitter <- bigrams_twitter %>% count(bigram, sort = TRUE)
print(paste('Number of unique twitter bigrams: ', nrow(bi_count_twitter)))
#https://stackoverflow.com/questions/28929243/with-ggplot2-what-code-creates-bars-made-of-individual-words-and-their-count
library(ggplot2)
library(ggthemes)
p <- ggplot(bi_count_news[1:20,], aes(bigram, n)) +
  geom_bar(stat="identity", fill=hcl(195,100,65)) +
  geom_text(aes(label=bigram, y=n*0.5), colour="white", size=5) +
  theme_tufte(base_size=10) 

#+   theme(axis.text.x=element_blank(),  axis.ticks.x=element_blank())

p<- p + coord_flip() + ggtitle('NEWS')
p

p <- ggplot(bi_count_blog[1:20,], aes(bigram, n)) +
  geom_bar(stat="identity", fill=hcl(195,100,65)) +
  geom_text(aes(label=bigram, y=n*0.5), colour="white", size=5) +
  theme_tufte(base_size=10) 

#+   theme(axis.text.x=element_blank(),  axis.ticks.x=element_blank())

p<- p + coord_flip() + ggtitle('BLOG')
p

p <- ggplot(bi_count_twitter[1:20,], aes(bigram, n)) +
  geom_bar(stat="identity", fill=hcl(195,100,65)) +
  geom_text(aes(label=bigram, y=n*0.5), colour="white", size=5) +
  theme_tufte(base_size=10) 

#+   theme(axis.text.x=element_blank(),  axis.ticks.x=element_blank())

p<- p + coord_flip() + ggtitle('TWITTER')
p

What happens when you reduce the word count prior to bigram?

There are many words in the corpus that show up only once. What happens if you just drop those words, replacing each with a '.'? Are the n-gram tables still obnoxiously large?

ANSWER: I won’t find out here, because the three methods I tried for removing the single-occurrence words all seem to take forever. (A possibly faster route is sketched after the commented-out attempts below.)

#single_words <- count_news[count_news$n == 1,]$word
#print('Number of single-word counts in the News: ',length(single_words))
#library(stringr)
# read_news_drop <- str_replace_all(read_news,single_words,'.')

# library(qdap)
# read_news_drop <- mgsub(single_words,'.',read_news)  #this froze
# read_news_drop <- read_news  
# require(progress)
# pb <- progress_bar$new(total = length(single_words))
# for(sw in single_words){
#         pb$tick()
#   read_news_drop <- gsub(sw,'.',read_news_drop)
# }
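A hedged sketch of an approach that might be faster than looping gsub over every rare word: tokenize each line once and do a set-membership test against the singleton vocabulary. Note this is only an approximation, since count_news$word was lowercased and punctuation-stripped by unnest_tokens, so a crude whitespace split will not match that vocabulary exactly.

# Replace singleton words with '.' using one tokenization pass and a
# set-membership test, instead of one regex substitution per rare word
single_words <- count_news$word[count_news$n == 1]

replace_singletons <- function(lines, rare) {
  tokens <- strsplit(tolower(lines), "\\s+")   # split each line into words once
  vapply(tokens, function(w) {
    w[w %in% rare] <- "."                      # set lookup, not a regex per word
    paste(w, collapse = " ")
  }, character(1))
}

# read_news_drop <- replace_singletons(read_news, single_words)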

At what point does more N add less value?

https://stats.stackexchange.com/questions/23429/at-what-n-do-n-grams-become-counterproductive

The informative answer to this stackexchange question mentions the idea of “Perplexity”, which should be a good read.
https://en.wikipedia.org/wiki/Perplexity
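As a rough sketch of the idea (the standard definition, not anything taken from the links above; the unigram probabilities and the floor for unseen words are simplifying assumptions): perplexity is the exponential of the average negative log-probability a model assigns to held-out words, and lower is better.

# Perplexity of a unigram model on a held-out word sequence
unigram_perplexity <- function(test_words, counts) {
  probs <- counts$n[match(test_words, counts$word)] / sum(counts$n)
  probs[is.na(probs)] <- 1e-10       # crude floor for unseen words
  exp(-mean(log(probs)))
}
# e.g. unigram_perplexity(c("the", "end", "of", "an", "era"), count_news)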

Take out the stop words
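A starting point for this next step, using the stop_words table that ships with tidytext (recounting only the News unigrams here; the same anti_join applies to the other corpora):

# Drop English stop words from the News unigrams and recount
data(stop_words)                     # tidytext's built-in stop word list

count_news_nostop <- tokens_news %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)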