The dataset

210,160,014 bytes en_US.blogs.txt

205,811,889 bytes en_US.news.txt

167,105,338 bytes en_US.twitter.txt

This page will show:

  1. Some basic summaries of the three files: word counts, line counts, and basic data tables

  2. Histograms to illustrate features of the data

f <- file("en_US.news.txt", open="rb")
read_news <- readLines(f) 
close(f)
f <- file("en_US.blogs.txt", open="rb")
read_blogs <- readLines(f) 
close(f)
f <- file("en_US.twitter.txt", open="rb")
read_twitter <- readLines(f) 
## Warning in readLines(f): line 167155 appears to contain an embedded nul
## Warning in readLines(f): line 268547 appears to contain an embedded nul
## Warning in readLines(f): line 1274086 appears to contain an embedded nul
## Warning in readLines(f): line 1759032 appears to contain an embedded nul
close(f)
print(paste('The News dataset has ',length(read_news),' lines, with the longest line being ',max(nchar(read_news))))
## [1] "The News dataset has  1010242  lines, with the longest line being  11384"
print(paste('The Blogs dataset has ',length(read_blogs),' lines, with the longest line being ',max(nchar(read_blogs))))
## [1] "The Blogs dataset has  899288  lines, with the longest line being  40835"
print(paste('The Twitter dataset has ',length(read_twitter),' lines, with the longest line being ',max(nchar(read_twitter))))
## [1] "The Twitter dataset has  2360148  lines, with the longest line being  213"

Word Counts

Tweets have much shorter lines. No surprise. Blogs tend to contain longer, more run-on writing than News. No surprise either.

The plan: create bigrams, trigrams, and quadgrams and build word counts around each. These datasets contain plenty of junk words and phrasings. Common advice is to clean out things like punctuation and curse words (though I suspect that stuff is still useful for prediction); a small filtering sketch follows the unigram tokenization below. The most useful reference I found was https://www.tidytextmining.com/ngrams.html

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(tibble)
## Loading required package: tibble
df_news <- tibble(text = read_news)
df_blog <- tibble(text = read_blogs)
df_twitter <- tibble(text = read_twitter)

#this is just 1-grams
tokens_news <- df_news %>% unnest_tokens(word,text)
count_news <- count(tokens_news,word,sort = TRUE)

tokens_blog <- df_blog %>% unnest_tokens(word,text)
count_blog <- count(tokens_blog,word,sort = TRUE)

tokens_twitter <- df_twitter %>% unnest_tokens(word,text)
count_twitter <- count(tokens_twitter,word,sort = TRUE)
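As a hedged sketch of the junk/profanity filtering mentioned earlier: the profanity vector below is just a placeholder, and a real published word list would be swapped in.

# Placeholder profanity list; substitute a real published word list.
profanity <- c("badword1", "badword2")

# Drop anything on the junk list from the News unigrams before counting;
# the same filter applies to the blog and twitter tokens.
count_news_clean <- tokens_news %>%
  filter(!word %in% profanity) %>%
  count(word, sort = TRUE)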

Unigram Counts

library(knitr)
# merge() defaults to an inner join, so count_all keeps only words that
# appear in all three corpora
count_all <- merge(count_news,count_blog, by = "word")
count_all <- merge(count_all,count_twitter, by = "word")

count_all <- count_all[order(count_all$n.x,decreasing = TRUE),]
colnames(count_all)[2]<- "News Count"
colnames(count_all)[3]<- "Blog Count"
colnames(count_all)[4]<- "Twitter Count"
kable(count_all[1:20,],caption = 'Unigram count')
Unigram count

          word    News Count   Blog Count   Twitter Count
  86340   the        1971816      1854232          936888
  87278   to          905909      1068623          788530
   7143   and         889057      1093307          438495
   4467   a           877651       899316          611266
  62113   of          774451       876475          359611
  43899   in          678766       597444          380282
  34679   for         353730       363211          385282
  86321   that        346201       459691          234560
  45636   is          284077       431696          358708
  62482   on          269764       275921          277955
  94594   with        254703       286464          173445
  75376   said        250403        36554           18124
  92988   was         228907       278127          117372
  40524   he          227740       144559           56393
  45770   it          217546       400818          294511
   8924   at          214022       172009          186733
   8559   as          187380       223421           70872
  41560   his         157610       109906           34159
  43109   i           153714       769248          722977
  10989   be          152777       208667          187746

Subsample because memory is overwhelmed

# Keep only the first 10% of lines from each corpus so later steps fit in memory
df_news <- df_news[1:round(0.1*length(df_news$text)),]
df_blog <- df_blog[1:round(0.1*length(df_blog$text)),]
df_twitter <- df_twitter[1:round(0.1*length(df_twitter$text)),]
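Taking the first 10% keeps only the earliest lines of each file; a random 10% would avoid any ordering bias. A sketch that could replace the chunk above (the seed value is arbitrary):

# Random 10% subsample instead of the first 10% of lines
set.seed(1234)                     # arbitrary seed, for reproducibility
df_news    <- df_news[sample(nrow(df_news), round(0.1 * nrow(df_news))), ]
df_blog    <- df_blog[sample(nrow(df_blog), round(0.1 * nrow(df_blog))), ]
df_twitter <- df_twitter[sample(nrow(df_twitter), round(0.1 * nrow(df_twitter))), ]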

N>1gram Counts

Focus on the News corpus as an example. As N increases, so does the number of distinct n-grams. This makes sense because the n-grams come from a sliding window: a line with w words yields w - N + 1 n-gram tokens, but longer n-grams repeat far less often, so the table of distinct n-grams keeps growing with N.
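The sliding-window arithmetic is easy to sanity-check on a toy line (a sketch using the same unnest_tokens call as below; the example sentence is arbitrary):

# A 5-word line yields w - N + 1 n-gram tokens: 4 bigrams, 3 trigrams, 2 quadgrams
toy <- tibble(text = "the quick brown fox jumps")
nrow(toy %>% unnest_tokens(bigram,   text, token = "ngrams", n = 2))  # 4
nrow(toy %>% unnest_tokens(trigram,  text, token = "ngrams", n = 3))  # 3
nrow(toy %>% unnest_tokens(quadgram, text, token = "ngrams", n = 4))  # 2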

print(paste('Number of unique News words: ', nrow(count_news)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_news <- df_news%>% unnest_tokens(bigram,text,token = "ngrams", n = 2)
bi_count_news <- bigrams_news %>% count(bigram, sort = TRUE)
print(paste('Number of unique News bigrams: ', nrow(bi_count_news)))
print('Getting trigrams')
## [1] "Getting trigrams"
trigrams_news <- df_news %>% unnest_tokens(trigram,text,token = "ngrams", n = 3)
print('Getting counts of trigrams')
## [1] "Getting counts of trigrams"
tri_count_news <- trigrams_news %>% count(trigram, sort = TRUE)

print('Getting quadgrams')
## [1] "Getting quadgrams"
quadgrams_news <- df_news %>% unnest_tokens(quadgram,text,token = "ngrams", n = 4)
print('Getting counts of quadgrams')
## [1] "Getting counts of quadgrams"
quad_count_news <- quadgrams_news %>% count(quadgram, sort = TRUE)


print(paste('Number of unique blog words: ', nrow(count_blog)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_blog <- df_blog%>% unnest_tokens(bigram,text,token = "ngrams", n = 2)
bi_count_blog <- bigrams_blog %>% count(bigram, sort = TRUE)
print(paste('Number of unique blog bigrams: ', nrow(bi_count_blog)))
print(paste('Number of unique twitter words: ', nrow(count_twitter)))
print('Getting bigrams')
## [1] "Getting bigrams"
bigrams_twitter <- df_twitter%>% unnest_tokens(bigram,text,token = "ngrams", n = 2)
bi_count_twitter <- bigrams_twitter %>% count(bigram, sort = TRUE)
print(paste('Number of unique twitter bigrams: ', nrow(bi_count_twitter)))
#https://stackoverflow.com/questions/28929243/with-ggplot2-what-code-creates-bars-made-of-individual-words-and-their-count
library(ggplot2)
library(ggthemes)
p <- ggplot(bi_count_news[1:20,], aes(bigram, n)) +
  geom_bar(stat="identity", fill=hcl(195,100,65)) +
  geom_text(aes(label=bigram, y=n*0.5), colour="white", size=5) +
  theme_tufte(base_size=10) 

#+   theme(axis.text.x=element_blank(),  axis.ticks.x=element_blank())

p<- p + coord_flip() + ggtitle('NEWS')
p

p <- ggplot(bi_count_blog[1:20,], aes(bigram, n)) +
  geom_bar(stat="identity", fill=hcl(195,100,65)) +
  geom_text(aes(label=bigram, y=n*0.5), colour="white", size=5) +
  theme_tufte(base_size=10) 

#+   theme(axis.text.x=element_blank(),  axis.ticks.x=element_blank())

p<- p + coord_flip() + ggtitle('BLOG')
p

p <- ggplot(bi_count_twitter[1:20,], aes(bigram, n)) +
  geom_bar(stat="identity", fill=hcl(195,100,65)) +
  geom_text(aes(label=bigram, y=n*0.5), colour="white", size=5) +
  theme_tufte(base_size=10) 

#+   theme(axis.text.x=element_blank(),  axis.ticks.x=element_blank())

p<- p + coord_flip() + ggtitle('TWITTER')
p

What happens when you reduce the word count prior to bigram?

There are many words in the corpus that show up only once. What happens if you just drop those words, replacing each with a '.'? Are the n-gram tables still obnoxiously large?

ANSWER: I won’t find out here, because the three methods I tried for removing the single-occurrence words all seem to take forever. (A possibly faster route is sketched after the commented-out attempts below.)

#single_words <- count_news[count_news$n == 1,]$word
#print('Number of single-word counts in the News: ',length(single_words))
#library(stringr)
# read_news_drop <- str_replace_all(read_news,single_words,'.')

# library(qdap)
# read_news_drop <- mgsub(single_words,'.',read_news)  #this froze
# read_news_drop <- read_news  
# require(progress)
# pb <- progress_bar$new(total = length(single_words))
# for(sw in single_words){
#         pb$tick()
#   read_news_drop <- gsub(sw,'.',read_news_drop)
# }
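A hedged sketch of an approach that might be faster than looping gsub over every rare word: tokenize each line once and do a set-membership test against the singleton vocabulary. Note this is only an approximation, since count_news$word was lowercased and punctuation-stripped by unnest_tokens, so a crude whitespace split will not match that vocabulary exactly.

# Replace singleton words with '.' using one tokenization pass and a
# set-membership test, instead of one regex substitution per rare word
single_words <- count_news$word[count_news$n == 1]

replace_singletons <- function(lines, rare) {
  tokens <- strsplit(tolower(lines), "\\s+")   # split each line into words once
  vapply(tokens, function(w) {
    w[w %in% rare] <- "."                      # set lookup, not a regex per word
    paste(w, collapse = " ")
  }, character(1))
}

# read_news_drop <- replace_singletons(read_news, single_words)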

At what point does more N add less value?

https://stats.stackexchange.com/questions/23429/at-what-n-do-n-grams-become-counterproductive

The informative answer to this stackexchange question mentions the idea of “Perplexity”, which should be a good read.
https://en.wikipedia.org/wiki/Perplexity
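As a rough sketch of the idea (the standard definition, not anything taken from the links above; the unigram probabilities and the floor for unseen words are simplifying assumptions): perplexity is the exponential of the average negative log-probability a model assigns to held-out words, and lower is better.

# Perplexity of a unigram model on a held-out word sequence
unigram_perplexity <- function(test_words, counts) {
  probs <- counts$n[match(test_words, counts$word)] / sum(counts$n)
  probs[is.na(probs)] <- 1e-10       # crude floor for unseen words
  exp(-mean(log(probs)))
}
# e.g. unigram_perplexity(c("the", "end", "of", "an", "era"), count_news)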

Take out the stop words
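A starting point for this next step, using the stop_words table that ships with tidytext (recounting only the News unigrams here; the same anti_join applies to the other corpora):

# Drop English stop words from the News unigrams and recount
data(stop_words)                     # tidytext's built-in stop word list

count_news_nostop <- tokens_news %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)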