Exploratory data analysis
In this section we examine the distribution of words and the relationships between words in the corpora, so that we can answer the following questions:
- Some words are more frequent than others - what are the distributions of word frequencies?
- What are the frequencies of 2-grams and 3-grams in the dataset?
- How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?
- How do you evaluate how many of the words come from foreign languages?
- Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?
Loading the data
The corpora are loaded in order to compute basic statistics for each type of source: Twitter, news and blogs.
# Path to the SwiftKey archive; the corpora are read straight from the zip.
f <- file.path(getwd(), "Coursera-SwiftKey.zip")

# Reading the files
# Each physical line of a corpus file becomes one row of a one-column data
# frame (column V1). Quote and comment processing are switched off because the
# text is free-form: with read.table()'s defaults a stray ' or " swallows the
# following lines and a # truncates the rest of a line.
read_corpus <- function(member) {
  read.table(unz(f, member), header = FALSE, sep = "\n", quote = "",
             comment.char = "", stringsAsFactors = FALSE)
}
de_twitter <- read_corpus("final/de_DE/de_DE.twitter.txt")
de_news <- read_corpus("final/de_DE/de_DE.news.txt")
de_blogs <- read_corpus("final/de_DE/de_DE.blogs.txt")

# Number of words in every line of each source.
words_blogs <- stri_count_words(de_blogs$V1)
words_news <- stri_count_words(de_news$V1)
words_twitter <- stri_count_words(de_twitter$V1)

# Uncompressed file sizes in MB, taken from the zip directory listing. The
# files are never extracted to disk, so file.info() on "final/de_DE/..." would
# only return NA. `[[` fails loudly if a member is missing from the archive.
zip_index <- unzip(f, list = TRUE)
zip_size_mb <- setNames(zip_index$Length, zip_index$Name) / 1024^2
size_blogs <- zip_size_mb[["final/de_DE/de_DE.blogs.txt"]]
size_news <- zip_size_mb[["final/de_DE/de_DE.news.txt"]]
size_twitter <- zip_size_mb[["final/de_DE/de_DE.twitter.txt"]]

# One summary row per source: size, line count, total and mean words per line.
basic_stats <- data.frame(
  filename = c("de_blogs", "de_news", "de_twitter"),
  file_size_MB = c(size_blogs, size_news, size_twitter),
  lines = c(length(de_blogs$V1), length(de_news$V1), length(de_twitter$V1)),
  num_words = c(sum(words_blogs), sum(words_news), sum(words_twitter)),
  mean_num_words = c(mean(words_blogs), mean(words_news), mean(words_twitter))
)
basic_stats
Now, we need to take a sample to perform some more complex calculations.
# Taking a sample.
# The seed makes the draw reproducible; the three draws happen in the same
# order as before (twitter, news, blogs), so the selected lines are unchanged.
set.seed(85)

# Draws up to `n` random lines from a character vector; capped at the number
# of available lines so a small source does not make sample() fail.
draw_lines <- function(lines, n = 1000) {
  lines[sample(length(lines), min(n, length(lines)))]
}

# c() keeps the sample a plain character vector with one element per document
# (rbind() would build a 3 x 1000 character matrix instead).
de_sample <- c(draw_lines(de_twitter$V1),
               draw_lines(de_news$V1),
               draw_lines(de_blogs$V1))

# The full corpora are no longer needed; free the memory.
remove(de_twitter, de_news, de_blogs)
A corpus is created from the sample and then cleaned: numbers, punctuation and repeated whitespace characters are removed. Afterwards, sparse terms are removed with a maximal allowed sparsity of 0.999, which helps to drop words from other languages and very uncommon words.
# Creating the corpus: one document per sampled line.
de_corpus <- VCorpus(VectorSource(de_sample))

# Cleaning the text (encoding, numbers, punctuation, repeated whitespace).
# content_transformer() lets tm_map() apply a plain character function while
# keeping every element a text document, so no PlainTextDocument repair step is
# needed afterwards. `sub = " "` replaces bytes that have no latin1 equivalent
# (e.g. emoji) with blanks; without it iconv() turns the whole text into NA.
to_latin1 <- content_transformer(function(x) {
  iconv(x, from = "UTF-8", to = "latin1", sub = " ")
})
de_corpus <- tm_map(de_corpus, to_latin1)
de_corpus <- tm_map(de_corpus, removeNumbers)
de_corpus <- tm_map(de_corpus, removePunctuation)
# Runs last so the blanks left behind by the previous steps are collapsed.
de_corpus <- tm_map(de_corpus, stripWhitespace)

# Creating a term-document matrix (terms in rows, documents in columns).
# NOTE(review): tm's default control drops terms shorter than three characters,
# so very short words are not counted here - confirm this is intended.
de_tdm <- TermDocumentMatrix(de_corpus)
nTerms(de_tdm)
[1] 35317
# Prune rare terms: with a sparsity ceiling of 0.999 a term is dropped when it
# is absent from (roughly) more than 99.9% of the documents.
de_tdm <- removeSparseTerms(x = de_tdm, sparse = 0.999)
# Vocabulary size after pruning.
nTerms(de_tdm)
[1] 4732
Analyzing frequent words
Next we look at the most frequent words in our corpora, since they are likely to appear in our 2-grams, 3-grams and 4-grams. Stop words are not removed, because the purpose of SwiftKey is to predict the next word to be typed.
# Finding frequent terms
# Total occurrences of each term across all documents, most frequent first.
de_freq <- sort(rowSums(as.matrix(de_tdm)), decreasing = TRUE)
# Ranked frequency table: one row per term.
de_wc <- data.frame(term = names(de_freq), frequency = de_freq)
# Running total of occurrences down the ranked list (used for coverage).
de_wc$cum_freq <- cumsum(de_wc$frequency)
# Number of most frequent words needed to cover 50% of all word instances:
# the rank of the first term at which the cumulative count reaches the target.
# (Counting the terms strictly below the target is one short of that rank.)
# Coverage is relative to the terms kept in de_wc, not the full vocabulary.
words_50 <- match(TRUE, de_wc$cum_freq >= 0.5 * sum(de_wc$frequency))
words_50
[1] 82
# Number of most frequent words needed to cover 90% of all word instances:
# the rank of the first term at which the cumulative count reaches the target.
# (Counting the terms strictly below the target is one short of that rank.)
# Coverage is relative to the terms kept in de_wc, not the full vocabulary.
words_90 <- match(TRUE, de_wc$cum_freq >= 0.9 * sum(de_wc$frequency))
words_90
[1] 2177
# Histogram of frequent terms
# Horizontal bar chart of every term seen more than 1000 times; bars are
# ordered and coloured by frequency.
p <- ggplot(
  subset(de_wc, frequency > 1000),
  aes(x = reorder(term, frequency), y = frequency)
) +
  geom_bar(aes(fill = frequency), stat = "identity") +
  coord_flip() +
  xlab("words")
p

# Word cloud of the single words that occur at least 300 times.
wordcloud(
  words = names(de_freq),
  freq = de_freq,
  min.freq = 300,
  colors = brewer.pal(6, "Accent")
)

Analyzing frequent n-grams
So far, we have explored the behaviour of individual words in the corpora. Time to see 2-grams and 3-grams.
# Creating tokenizers.
# Each tokenizer splits a document into words, forms every run of n
# consecutive words and returns the runs as space-separated strings.

# Tokenizer producing 2-grams.
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  joined <- lapply(word_pairs, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}

# Tokenizer producing 3-grams.
TrigramTokenizer <- function(x) {
  word_triples <- ngrams(words(x), 3)
  joined <- lapply(word_triples, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}
# 2-grams
# Term-document matrix whose terms are the word pairs from BigramTokenizer.
de_tdm_2g <- TermDocumentMatrix(de_corpus, control = list(tokenize = BigramTokenizer))
# Same sparsity ceiling as for the single words.
de_tdm_2g <- removeSparseTerms(de_tdm_2g, 0.999)
# Finding frequent terms
# Total occurrences of each 2-gram, most frequent first.
de_freq_2g <- sort(rowSums(as.matrix(de_tdm_2g)), decreasing = TRUE)
# 2-grams that occur at least 100 times.
findFreqTerms(de_tdm_2g, lowfreq = 100)
[1] "an der" "auch die" "auf dem" "auf den" "auf der" "auf die" "aus dem"
[8] "bei der" "das ist" "dass die" "für den" "für die" "in den" "in der"
[15] "in die" "mehr als" "mit dem" "mit der" "mit einem" "nicht mehr" "sich die"
[22] "über die" "um die" "und der" "und die" "von der"
# Ranked 2-gram frequency table.
# NOTE(review): the 3-gram section further down assigns to the same name
# `de_wc_g`, so this table is overwritten there.
de_wc_g <- data.frame(term = names(de_freq_2g), occurrences = de_freq_2g)
# Wordcloud of the 2-grams that occur at least 75 times.
wordcloud(names(de_freq_2g), de_freq_2g, min.freq = 75,
          colors = brewer.pal(6, "Accent"))

# 3-grams
# Term-document matrix whose terms are the word triples from TrigramTokenizer.
de_tdm_3g <- TermDocumentMatrix(de_corpus, control = list(tokenize = TrigramTokenizer))
# Same sparsity ceiling as for the single words and the 2-grams.
de_tdm_3g <- removeSparseTerms(de_tdm_3g, 0.999)
# Finding frequent terms
# Total occurrences of each 3-gram, most frequent first.
de_freq_3g <- sort(rowSums(as.matrix(de_tdm_3g)), decreasing = TRUE)
# 3-grams that occur at least 12 times.
findFreqTerms(de_tdm_3g, lowfreq = 12)
[1] "auf jeden fall" "das ist ein" "den vergangenen jahren"
[4] "die zahl der" "im vergangenen jahr" "in den letzten"
[7] "in den nächsten" "in den vergangenen" "in diesem jahr"
[10] "nach wie vor" "sich in den" "sich in der"
# Ranked 3-gram frequency table (reuses the name of the 2-gram table above,
# replacing it).
de_wc_g <- data.frame(term = names(de_freq_3g), occurrences = de_freq_3g)
# Wordcloud of at most the ten most frequent 3-grams, drawn at a reduced scale.
# NOTE(review): min.freq = 50 is well above the lowfreq = 12 threshold used for
# findFreqTerms() just before; if no 3-gram reaches 50 occurrences the cloud is
# empty - confirm against the data and lower min.freq if needed.
wordcloud(names(de_freq_3g), de_freq_3g, min.freq = 50, scale = c(2, .25),
          max.words = 10, colors = brewer.pal(3, "Accent"))

