210,160,014 bytes en_US.blogs.txt
205,811,889 bytes en_US.news.txt
167,105,338 bytes en_US.twitter.txt
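These byte counts can be reproduced from R itself; a minimal sketch, assuming the three files sit in the working directory:

# file.size() returns sizes in bytes, matching the figures above
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
setNames(file.size(files), files)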
This page will show:

- some basic summaries of the 3 files: word counts, line counts, and basic data tables
- histograms to illustrate features of the data
f <- file("en_US.news.txt", open="rb")
read_news <- readLines(f)
close(f)
f <- file("en_US.blogs.txt", open="rb")
read_blogs <- readLines(f)
close(f)
f <- file("en_US.twitter.txt", open="rb")
read_twitter <- readLines(f)
## Warning in readLines(f): line 167155 appears to contain an embedded nul
## Warning in readLines(f): line 268547 appears to contain an embedded nul
## Warning in readLines(f): line 1274086 appears to contain an embedded nul
## Warning in readLines(f): line 1759032 appears to contain an embedded nul
close(f)
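The embedded-nul warnings are harmless here (those bytes are simply dropped), but readLines() has a skipNul argument if you would rather silence them. A one-line alternative for the Twitter read:

# skipNul = TRUE drops embedded nuls without emitting the warnings
f <- file("en_US.twitter.txt", open="rb")
read_twitter <- readLines(f, skipNul = TRUE)
close(f)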
print(paste('The News dataset has', length(read_news), 'lines, with the longest line being', max(nchar(read_news))))
## [1] "The News dataset has 1010242 lines, with the longest line being 11384"
print(paste('The Blogs dataset has', length(read_blogs), 'lines, with the longest line being', max(nchar(read_blogs))))
## [1] "The Blogs dataset has 899288 lines, with the longest line being 40835"
print(paste('The Twitter dataset has', length(read_twitter), 'lines, with the longest line being', max(nchar(read_twitter))))
## [1] "The Twitter dataset has 2360148 lines, with the longest line being 213"
Tweets have shorter lines. No surprise, given Twitter's character limit. And blogs tend to contain more run-on sentences than news. No surprise either.
Next, just go ahead and create bigrams, trigrams, and quadgrams, with word counts for each. These datasets contain a lot of junk words and phrasings. The common advice seems to be to clean out things like punctuation and curse words (though I think that crap’s useful for prediction). Really the most useful site I found was https://www.tidytextmining.com/ngrams.html
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(tibble)
## Loading required package: tibble
df_news <- tibble(text = read_news)
df_blog <- tibble(text = read_blogs)
df_twitter <- tibble(text = read_twitter)
# Unigrams (1-grams): unnest_tokens() lowercases and strips punctuation by default
tokens_news <- df_news %>% unnest_tokens(word, text)
count_news <- count(tokens_news, word, sort = TRUE)
tokens_blog <- df_blog %>% unnest_tokens(word, text)
count_blog <- count(tokens_blog, word, sort = TRUE)
tokens_twitter <- df_twitter %>% unnest_tokens(word, text)
count_twitter <- count(tokens_twitter, word, sort = TRUE)
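Following the tidytextmining advice above, stop words (and, if desired, profanity) can be stripped after tokenizing with dplyr's anti_join(). A minimal sketch; the profanity list here is a placeholder you would supply yourself:

# anti_join() drops every token that appears in the filter table
data(stop_words)  # tidytext's built-in stop-word table (column: word)
profanity <- tibble(word = c("badword1", "badword2"))  # hypothetical list
tokens_news_clean <- tokens_news %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(profanity, by = "word")

I skip this step here, since I suspect that "junk" is actually useful for prediction.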
library(knitr)
# merge() defaults to an inner join, so only words that appear in all
# three corpora survive into count_all
count_all <- merge(count_news, count_blog, by = "word")
count_all <- merge(count_all, count_twitter, by = "word")
count_all <- count_all[order(count_all$n.x, decreasing = TRUE), ]
colnames(count_all)[2] <- "News Count"
colnames(count_all)[3] <- "Blog Count"
colnames(count_all)[4] <- "Twitter Count"
kable(count_all[1:20, ], caption = 'Unigram count')
| word | News Count | Blog Count | Twitter Count |
|---|---|---|---|
| the | 1971816 | 1854232 | 936888 |
| to | 905909 | 1068623 | 788530 |
| and | 889057 | 1093307 | 438495 |
| a | 877651 | 899316 | 611266 |
| of | 774451 | 876475 | 359611 |
| in | 678766 | 597444 | 380282 |
| for | 353730 | 363211 | 385282 |
| that | 346201 | 459691 | 234560 |
| is | 284077 | 431696 | 358708 |
| on | 269764 | 275921 | 277955 |
| with | 254703 | 286464 | 173445 |
| said | 250403 | 36554 | 18124 |
| was | 228907 | 278127 | 117372 |
| he | 227740 | 144559 | 56393 |
| it | 217546 | 400818 | 294511 |
| at | 214022 | 172009 | 186733 |
| as | 187380 | 223421 | 70872 |
| his | 157610 | 109906 | 34159 |
| i | 153714 | 769248 | 722977 |
| be | 152777 | 208667 | 187746 |

Table: Unigram count
# Work with only the first 10% of each corpus from here on, to keep the
# n-gram passes manageable. (A random sample would arguably be more
# representative than just the head of each file.)
df_news <- df_news[1:round(0.1 * nrow(df_news)), ]
df_blog <- df_blog[1:round(0.1 * nrow(df_blog)), ]
df_twitter <- df_twitter[1:round(0.1 * nrow(df_twitter)), ]
Focus on the news as an example. As n increases, so does the number of distinct n-grams. This makes sense because n-grams are formed from a sliding window: a line of w tokens yields w - n + 1 n-grams, so the total count per line barely changes, but longer windows repeat far less often, and the number of unique n-grams therefore grows with n.
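A quick sanity check of the per-line sliding-window arithmetic on a toy sentence:

# A line of w tokens yields w - n + 1 n-grams
w <- length(strsplit("the quick brown fox jumps", " ")[[1]])  # w = 5
sapply(2:4, function(n) w - n + 1)  # 4 bigrams, 3 trigrams, 2 quadgrams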
print(paste('Number of unique News words:', nrow(count_news)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_news <- df_news %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
bi_count_news <- bigrams_news %>% count(bigram, sort = TRUE)
print(paste('Number of unique News bigrams:', nrow(bi_count_news)))
print('Getting trigrams')
## [1] "Getting trigrams"
trigrams_news <- df_news %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
print('Getting counts of trigrams')
## [1] "Getting counts of trigrams"
tri_count_news <- trigrams_news %>% count(trigram, sort = TRUE)
print('Getting quadgrams')
## [1] "Getting quadgrams"
quadgrams_news <- df_news %>% unnest_tokens(quadgram, text, token = "ngrams", n = 4)
print('Getting counts of quadgrams')
## [1] "Getting counts of quadgrams"
quad_count_news <- quadgrams_news %>% count(quadgram, sort = TRUE)
print(paste('Number of unique blog words:', nrow(count_blog)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_blog <- df_blog %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
bi_count_blog <- bigrams_blog %>% count(bigram, sort = TRUE)
print(paste('Number of unique blog bigrams:', nrow(bi_count_blog)))
print(paste('Number of unique twitter words:', nrow(count_twitter)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_twitter <- df_twitter %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
bi_count_twitter <- bigrams_twitter %>% count(bigram, sort = TRUE)
print(paste('Number of unique twitter bigrams:', nrow(bi_count_twitter)))
#https://stackoverflow.com/questions/28929243/with-ggplot2-what-code-creates-bars-made-of-individual-words-and-their-count
library(ggplot2)
library(ggthemes)
# Helper: horizontal bars for the 20 most frequent bigrams, with each bigram
# printed inside its bar (per the Stack Overflow answer above). reorder()
# keeps the bars in frequency order; otherwise ggplot sorts them alphabetically.
plot_top_bigrams <- function(counts, title) {
  ggplot(counts[1:20, ], aes(reorder(bigram, n), n)) +
    geom_bar(stat = "identity", fill = hcl(195, 100, 65)) +
    geom_text(aes(label = bigram, y = n * 0.5), colour = "white", size = 5) +
    theme_tufte(base_size = 10) +
    #theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
    coord_flip() +
    ggtitle(title)
}
plot_top_bigrams(bi_count_news, 'NEWS')
plot_top_bigrams(bi_count_blog, 'BLOG')
plot_top_bigrams(bi_count_twitter, 'TWITTER')
There are many words in the corpus that show up only once. What happens if you drop those words, replacing each with a ‘.’? Are the n-gram tables still obnoxiously large?
ANSWER: I won’t find out here, because all three methods I tried for removing the singleton words ran far too slowly. The commented-out attempts, and a sketch of a faster route, follow.
#single_words <- count_news[count_news$n == 1,]$word
#print(paste('Number of singleton words in the News:', length(single_words)))
# Attempt 1: str_replace_all() pairs a vector of patterns with strings
# elementwise, so this doesn't do what's wanted (a named pattern vector
# would, but is still slow at this scale).
#library(stringr)
#read_news_drop <- str_replace_all(read_news, single_words, '.')
# Attempt 2: qdap's mgsub() -- this froze.
#library(qdap)
#read_news_drop <- mgsub(single_words, '.', read_news)
# Attempt 3: one gsub() pass over the whole corpus per singleton word,
# i.e. hundreds of thousands of full-corpus scans.
#read_news_drop <- read_news
#require(progress)
#pb <- progress_bar$new(total = length(single_words))
#for(sw in single_words){
#  pb$tick()
#  read_news_drop <- gsub(sw, '.', read_news_drop)
#}
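A route that should be much faster, sketched but not actually run here: do the replacement at the token level instead of with regexes. A set-membership test via %in% is a hash lookup, so each token costs roughly constant time, versus one full-corpus regex pass per singleton word. One caveat: unnest_tokens() lowercased and stripped punctuation, so tokens from a plain strsplit() on the raw lines will not match count_news perfectly.

single_words <- count_news$word[count_news$n == 1]
# Replace every singleton token with '.' using a vectorized set lookup
replace_singletons <- function(lines, vocab_to_drop) {
  tokens <- strsplit(tolower(lines), "\\s+")
  vapply(tokens, function(ws) {
    ws[ws %in% vocab_to_drop] <- "."
    paste(ws, collapse = " ")
  }, character(1))
}
#read_news_drop <- replace_singletons(read_news, single_words)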
https://stats.stackexchange.com/questions/23429/at-what-n-do-n-grams-become-counterproductive
The informative answer to this Stack Exchange question brings up “perplexity”, the standard measure of how well a language model predicts held-out text. Worth a read:
https://en.wikipedia.org/wiki/Perplexity
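To make that concrete, a minimal sketch of perplexity under a maximum-likelihood unigram model built from count_news. The test sentence is a toy example, and every test word is assumed to be in the vocabulary; real models need smoothing to handle unseen words:

# Perplexity is the inverse probability of the test words, normalized by
# their count: PP = exp(-mean(log p(w))). Lower = less surprised.
probs <- setNames(count_news$n / sum(count_news$n), count_news$word)
test <- c("the", "city", "council", "said")  # toy test sentence
exp(-mean(log(probs[test])))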