In this report the Twitter, blogs and news data sets are explored. The total amount of data is large, so sub-sampling is necessary. Trigrams appear to be the largest useful n-gram size.
The environment is set up by loading the stringr, dplyr, tm, stringi and RWeka libraries.
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(stringi)
library(RWeka)
rm(list=ls())
Next, the data is loaded from the files.
print(getwd())
## [1] "C:/Users/tdurieux/Documents/Repositories/coursera_repos/Coursera_hopkins_capstone/02_exploratory_data_analysis"
# TWITTER
con <- file("Data/en_US.twitter.txt", "r")
twitter <- readLines(con, skipNul = TRUE)
close(con)
# BLOGS
con <- file("Data/en_US.blogs.txt", "r")
blogs <- readLines(con, skipNul = TRUE)
close(con)
# NEWS
con <- file("Data/en_US.news.txt", "r")
news <- readLines(con, skipNul = TRUE)
close(con)
A first exploration shows that the Twitter file contains over 2.3 million tweets, the blogs file almost 0.9 million blog posts and the news file about 77 thousand news articles. On the other hand, the tweets are far shorter on average (about 13 words versus 35 for news and 42 for blogs). To build n-grams it therefore seems better to use relatively more of the news and blogs files than of the Twitter file.
# Count lines per file
print(paste('The twitter file contains ', length(twitter), ' lines'))
## [1] "The twitter file contains 2360148 lines"
print(paste('The news file contains ', length(news), ' lines'))
## [1] "The news file contains 77259 lines"
print(paste('The blogs file contains ', length(blogs), ' lines'))
## [1] "The blogs file contains 899288 lines"
# Count words per file
print(paste(
'The twitter file contains ', sum(stri_count_words(twitter)), ' words',
' with an average of ', mean(stri_count_words(twitter)), ' per tweet'
))
## [1] "The twitter file contains 30218166 words with an average of 12.8035046954683 per tweet"
print(paste(
'The news file contains ', sum(stri_count_words(news)), ' words',
' with an average of ', mean(stri_count_words(news)), ' per article'
))
## [1] "The news file contains 2693898 words with an average of 34.8684036811245 per article"
print(paste(
'The blog file contains ', sum(stri_count_words(blogs)), ' words',
' with an average of ', mean(stri_count_words(blogs)), ' per blog'
))
## [1] "The blog file contains 38154238 words with an average of 42.4271623773474 per blog"
A corpus is made by combining samples of an equal number of items (1000) from each source.
set.seed(42)
data_sample <- c(sample(twitter, 1000),
sample(blogs, 1000),
sample(news, 1000))
rm(twitter, blogs, news)
From this sample a VCorpus is made.
corpus <- VCorpus(VectorSource(data_sample))
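As a quick sanity check a single document can be inspected; the index 1 below is arbitrary and only meant for illustration.
# Peek at the first document of the corpus
writeLines(as.character(corpus[[1]]))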
During cleaning it is found that there are URLs (links to files), Twitter handles and hashtags that can be removed. Next, non-regular characters and non-letters are removed, everything is converted to lower case, numbers are removed, whitespace is stripped, stopwords are removed, etc.
replace_string <- function(x, pattern) gsub(pattern, " ", x)
replace_regex <- function(x, pattern) str_replace_all(x, pattern, " ")
delete_regex <- function(x, pattern) str_replace_all(x, pattern, "")
# Repair common contractions whose apostrophe was mangled by encoding issues;
# the "â€™" sequence is the mis-encoded right single quote as it appears in the raw text
replace_contraction_error <- function(doc) {
doc <- gsub("nâ€™t", "n't", doc)
doc <- gsub("â€™ll", "'ll", doc)
doc <- gsub("â€™re", "'re", doc)
doc <- gsub("â€™ve", "'ve", doc)
doc <- gsub("â€™m", "'m", doc)
doc <- gsub("itâ€™s", "it's", doc) # a special case of 's
return(doc)
}
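For example, on a made-up input the function restores a readable contraction (the string below is hypothetical, not taken from the data, and assumes the mis-encoded apostrophe sequence targeted above).
# Hypothetical mis-encoded input; "donâ€™t" stands for a corrupted "don't"
replace_contraction_error("I donâ€™t know")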
clean_string <- function(x){
x <- tolower(x)
x <- replace_contraction_error(x)
x <- removeWords(x, stopwords("en"))
x <- replace_string(x, "(f|ht)tp(s?)://(.*)[.][a-z]+") # remove URLs
x <- replace_string(x, "@[^\\s]+") # remove twitter handles
x <- replace_regex(x, "#[\\w|\\d]*") # remove hashtags
x <- delete_regex(x, "[^\\w\\s]") # remove everything except word characters and whitespace
x <- replace_regex(x, "[^ -~]") # remove non-ASCII characters
x <- removeNumbers(x)
x <- stripWhitespace(x)
x <- delete_regex(x, "^\\s*") # remove leading whitespace
x <- delete_regex(x, "\\s*$") # remove trailing whitespace
x
}
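As an illustration, the cleaning function can be applied to a made-up example line (not a line from the corpus).
# Hypothetical input: it is lower-cased, then the stopword 'out', the URL, the handle,
# the hashtag, the punctuation and the number are removed and whitespace is trimmed
clean_string("Check OUT https://example.com #rstats @someone said 42 things!")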
The cleaning function is then applied to the corpus.
clean_corpus <- tm_map(corpus, clean_string)
# clean_string returns plain character vectors, so wrap them back into PlainTextDocuments
clean_corpus <- tm_map(clean_corpus, PlainTextDocument)
# rm(corpus)
It can be seen that there are unigrams (words) with up to 299 occurrences (remember, after removing stopwords). ‘said’ (299) and ‘will’ (272) are the ones with the most occurrences.
unigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
dtm_unigram <- DocumentTermMatrix(clean_corpus, control = list(tokenize = unigram_tokenizer))
n_docs <- nDocs(dtm_unigram)
# keep only terms that appear in at least two documents of the sample
dtm_unigram <- removeSparseTerms(dtm_unigram, (n_docs - 1.1) / n_docs)
tbl_dtm_unigram <- as_tibble(as.matrix(dtm_unigram))
tbl_frequencies <- summarise_all(tbl_dtm_unigram, sum)
unigram_frequencies <- unlist(tbl_frequencies)
rm(dtm_unigram)
rm(tbl_dtm_unigram)
rm(tbl_frequencies)
hist(
unigram_frequencies, breaks = 30,
xlab='Number of word occurrences',
ylab='Number of words with given frequency of occurrence',
main='Histogram of unigram (word) occurrences'
)
hist(
unigram_frequencies[which(unigram_frequencies > 100)],
xlab='Number of word occurrences',
ylab='Number of words with given frequency of occurrence',
main='Histogram of unigrams (words) with more than 100 occurrences'
)
print('the unigrams with more than 200 occurrences are:')
## [1] "the unigrams with more than 200 occurrences are:"
print(unigram_frequencies[which(unigram_frequencies > 200)])
## just like one said will
## 219 215 246 299 272
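For a broader view than the fixed threshold of 200, the most frequent unigrams can also be listed directly from the same frequency vector.
# Ten most frequent unigrams in the sample (stopwords already removed)
head(sort(unigram_frequencies, decreasing = TRUE), 10)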
Most bigrams occur fewer than 3 times (bigrams occurring only once were left out). There are 9 bigrams with more than 15 occurrences, and all of them look plausible, like ‘new york’ and ‘right now’.
bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm_bigram <- DocumentTermMatrix(clean_corpus, control = list(tokenize = bigram_tokenizer))
n_docs <- nDocs(dtm_bigram)
dtm_bigram <- removeSparseTerms(dtm_bigram, (n_docs - 1.1) / n_docs)
tbl_dtm_bigram <- as_tibble(as.matrix(dtm_bigram))
tbl_frequencies <- summarise_all(tbl_dtm_bigram, sum)
bigram_frequencies <- unlist(tbl_frequencies)
rm(dtm_bigram)
rm(tbl_dtm_bigram)
rm(tbl_frequencies)
hist(
bigram_frequencies, breaks = 30,
xlab='#bigram occurrences',
ylab='#bigrams with given occurrence',
main='Histogram of bigram occurrences'
)
hist(
bigram_frequencies[which(bigram_frequencies > 5)],
xlab='#bigram occurrences',
ylab='#bigrams with given occurrence',
main='Histogram of bigrams with more than 5 occurrences'
)
print('the bigrams with more than 15 occurrences are:')
## [1] "the bigrams with more than 15 occurrences are:"
print(bigram_frequencies[which(bigram_frequencies > 15)])
## doesn t don t i m it s last week last year make sure
## 17 39 57 47 18 17 16
## new york right now
## 17 25
There are even 80 trigrams that occur twice and 14 that occur more often than that.
trigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtm_trigram <- DocumentTermMatrix(clean_corpus, control = list(tokenize = trigram_tokenizer))
n_docs <- nDocs(dtm_trigram)
dtm_trigram <- removeSparseTerms(dtm_trigram, (n_docs - 1.1) / n_docs)
tbl_dtm_trigram <- as_tibble(as.matrix(dtm_trigram))
tbl_frequencies <- summarise_all(tbl_dtm_trigram, sum)
trigram_frequencies <- unlist(tbl_frequencies)
hist(
trigram_frequencies,
xlab='#trigram occurrences',
ylab='#trigrams with given occurrence',
main='Histogram of trigram occurrences'
)
hist(
trigram_frequencies[which(trigram_frequencies > 2)],
xlab='#trigram occurrences',
ylab='#trigrams with given occurrence',
main='Histogram of trigrams with more than 2 occurrences'
)
print('the trigrams with more than 2 occurrences are:')
## [1] "the trigrams with more than 2 occurrences are:"
print(trigram_frequencies[which(trigram_frequencies > 2)])
## amazon services llc don t know don t want
## 4 8 3
## g protein g happy new year i m going
## 3 4 10
## i m sure it s just know it s
## 3 3 3
## llc amazon eu people don t services llc amazon
## 4 4 4
## think it s three years ago
## 3 3
In this data set there are only 2 quadgrams with more than 2 occurrences, and both appear to be fragments of the Amazon Services LLC company name. Quadgrams therefore seem to add too little to be useful.
quadgram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
dtm_quadgram <- DocumentTermMatrix(clean_corpus, control = list(tokenize = quadgram_tokenizer))
n_docs <- nDocs(dtm_quadgram)
dtm_quadgram <- removeSparseTerms(dtm_quadgram, (n_docs - 1.1) / n_docs)
tbl_dtm_quadgram <- as_tibble(as.matrix(dtm_quadgram))
tbl_frequencies <- summarise_all(tbl_dtm_quadgram, sum)
quadgram_frequencies <- unlist(tbl_frequencies)
print(paste('The number of quadgrams with more than 2 occurrences is', length(quadgram_frequencies[which(quadgram_frequencies > 2)])))
## [1] "The number of quadgrams with more than 2 occurrences is 2"
quadgram_frequencies[which(quadgram_frequencies > 2)]
## amazon services llc amazon services llc amazon eu
## 4 4