Introduction

Hi, dear reader. This is the milestone report for the Capstone Project of the Data Science Coursera specialization. Its purpose is to show that I performed an exploratory analysis and understood most of the task. The first step was to download the archive containing the three files needed for this task from here.

There was also a task to show you, dear reader, that I successfully downloaded this archive and familiarized myself with its content. I will not include the code responsible for downloading the archive here; I hope the conclusions that follow will convince you that I did download it.
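For completeness, here is a minimal sketch of how the download step could be reproduced. The archive URL is an assumption taken from the course instructions and may differ; the target folder matches the paths used later in this report.

### hypothetical download step (URL assumed; not part of the analysis below)
archive_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
archive_zip <- "Coursera-SwiftKey.zip"
if (!file.exists(archive_zip)) {
    download.file(archive_url, destfile = archive_zip, mode = "wb")
}
if (!dir.exists("Coursera-Swiftkey")) {
    unzip(archive_zip, exdir = "Coursera-Swiftkey")
}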

That archive contains folders for different locales, and we are interested only in the “en_US” folder.

There are three files in it:

- en_US.blogs.txt
- en_US.news.txt
- en_US.twitter.txt
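As a quick check that the archive was unpacked where the rest of this report expects it (the path is the one used in the Preparations section below), the folder contents can be listed directly:

### confirm the contents of the en_US folder
list.files("Coursera-Swiftkey/final/en_US")
### should list en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt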

Each file contains lines of text; we will talk about them a little later. My exploratory analysis took quite a long time, and I used a lot of different libraries.

Used libraries

library(qdap)        # text cleaning helpers, word counts and word lists
library(tm)          # corpus tools, stopwords, removeNumbers/removePunctuation
library(wordcloud)   # word clouds
library(dendextend)  # dendrograms
library(RWeka)       # NGramTokenizer for building N-grams
library(R.utils)     # countLines
library(data.table)  # aggregated tables
library(LaF)         # sample_lines for reading random lines from large files
library(dplyr)       # data manipulation
library(stringr)     # string helpers
library(ggplot2)     # plots

Created functions

During my work I performed a lot of repetitive actions. Because I am quite lazy, and to make my code clearer, I created some helper functions. While working I realized that some of them are useless, but I decided to leave them here anyway; maybe they will help someone.

clean_corpus <- function(corpus){
    corpus <- tm_map(corpus, removeNumbers)                  # drop digits
    corpus <- tm_map(corpus, stripWhitespace)                # collapse repeated whitespace
    corpus <- tm_map(corpus, content_transformer(tolower))   # lowercase (wrapped so tm keeps the corpus structure)
    corpus <- tm_map(corpus, removePunctuation)              # drop punctuation
    corpus <- tm_map(corpus, removeWords, stopwords("en"))   # drop English stopwords
    corpus
}

light_clean <- function(text){
    text <- replace_contraction(text)   # "can't" -> "cannot" (needs the apostrophes, so do it first)
    text <- replace_symbol(text)        # "@" -> "at", "%" -> "percent", ...
    text <- replace_ordinal(text)       # "1st" -> "first"
    text <- replace_number(text)        # "3" -> "three"
    text <- tolower(text)
    text <- removeNumbers(text)         # drop any digits that are left
    text <- removePunctuation(text)
    text
}
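A quick illustration of what light_clean does (hypothetical input; the exact output depends on the qdap dictionaries, so it is only described in the comment):

### illustrative only: the contraction is expanded, "@" is spelled out,
### the ordinal and the number are written as words, leftover digits and
### punctuation are removed, and everything is lowercased
light_clean("I can't meet u @ 5pm on the 3rd!")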

read_random_lines <- function(filePath, PercentOfRandomLines = 1){
    lines_in_file <- as.numeric(countLines(filePath))                    # R.utils::countLines
    how_much_lines <- round(lines_in_file * PercentOfRandomLines / 100)
    random_lines_from_file <- sample_lines(filePath, how_much_lines, lines_in_file)  # LaF::sample_lines
    random_lines_from_file
} 

quantityOfUniqueWords <- function(text){
    temp <- word_list(text)                         # qdap::word_list
    unique_words <- nrow(as.data.frame(temp$fwl))   # fwl: frequency table of all words
    unique_words
}

getNgram <- function(text, NgramNum){
    ngram <- NGramTokenizer(text, Weka_control(min = NgramNum, max = NgramNum,delimiters = " \\r\\n\\t.,;:\"()?!"))
    ngram_df <- as.data.frame(table(ngram))
    ngram_ordered <- ngram_df[order(ngram_df$Freq,decreasing = TRUE),]
    names(ngram_ordered) <- c("word","Freq")
    ngram_ordered
}

print_comparing_bar_plot_deprecated <- function(dirtyNgram, cleanNgram){
    oneGram_dfR <- rbind(head(dirtyNgram,10), head(cleanNgram,10))
    oneGram_dfR <- cbind(oneGram_dfR,"cleaned_corpus" = rep(c(FALSE, TRUE), each=10))
    oneGram_dfR$cleaned_corpus <- as.factor(oneGram_dfR$cleaned_corpus)
    
    p <- ggplot(oneGram_dfR) +
        geom_bar(aes(x = reorder(word, -Freq), y = Freq , fill = cleaned_corpus), 
                 stat="identity", position = "dodge", width = 0.7) +
        scale_fill_manual("Corpus type\n", values = c("red","blue"), 
                          labels = c("Dirty_corpus", "Cleaned_corpus")) +
        labs(x="words",y="frequency") +
        theme_bw(base_size = 14)
    p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

ngram_comparing_table <- function(dirtyNgram, cleanNgram) {
    nGram_dt <- as.data.table(cbind(head(dirtyNgram,10), head(cleanNgram,10)))
    names(nGram_dt) <- c("ngram_dirty", "freq_dirty", "ngram_clean", "freq_clean")
    nGram_dt
}

print_comparing_bar_plot <- function(dirtyNgram, cleanNgram){
    oneGram_dfR <- rbind(head(dirtyNgram,10), head(cleanNgram,10))
    oneGram_dfR <- cbind(oneGram_dfR,"cleaned_corpus" = rep(c(FALSE, TRUE), each=10))
    oneGram_dfR$cleaned_corpus <- as.factor(oneGram_dfR$cleaned_corpus)
    
    p <- ggplot(oneGram_dfR) +
        geom_col(aes(x = reorder(word, -Freq), y = Freq , fill = cleaned_corpus), 
                 position = "dodge", width = 0.7) +
        scale_fill_manual("Corpus type\n", values = c("red","blue"), 
                          labels = c("Dirty_corpus", "Cleaned_corpus")) +
        labs(x="words",y="frequency") +
        theme_bw(base_size = 14)
    p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

Preparations

First I set up my working directory and define variables for the files of interest.

### setup working directory
setwd("e:/jhu")

### define variables for files
blogs_file <- "Coursera-Swiftkey/final/en_US/en_US.blogs.txt"
news_file <- "Coursera-Swiftkey/final/en_US/en_US.news.txt"
twitter_file <- "Coursera-Swiftkey/final/en_US/en_US.twitter.txt"
profanity_file <- "Coursera-Swiftkey/final/en_US/profanity.txt"

After defining the files I gathered some basic information about them, such as:

- file size
- number of lines in each file
- maximum number of characters in a single line of each file
- total number of words in each file
- maximum number of words in a single line

### get file size in megabytes
blog_fi <- file.info(blogs_file)$size / 1024^2
news_fi <- file.info(news_file)$size / 1024^2
twit_fi <- file.info(twitter_file)$size / 1024^2

### read files
blogs <- readLines(blogs_file)
news <- readLines(news_file)
twitter <- readLines(twitter_file, skipNul = TRUE)   # the twitter file contains embedded nul characters


### make light clean
blogs_light_cleaned <- light_clean(blogs)
news_light_cleaned <- light_clean(news)
twit_light_cleaned <- light_clean(twitter)
union_text <- c(blogs_light_cleaned, news_light_cleaned, twit_light_cleaned)


### find the number of lines in each file
blog_lines <- length(blogs)
news_lines <- length(news)
twitter_lines <- length(twitter)

### find the maximum number of characters in a line for each file
blog_maxChar_in_lines <- max(nchar(blogs))
news_maxChar_in_lines <- max(nchar(news))
twitter_maxChar_in_lines <- max(nchar(twitter))

### count words in each line
blog_word_count_byRow <- word_count(blogs_light_cleaned, byrow = TRUE)
news_word_count_byRow <- word_count(news_light_cleaned, byrow = TRUE)
twit_word_count_byRow <- word_count(twit_light_cleaned, byrow = TRUE)


### find the maximum number of words in a single line
blog_max_word_in_line <- max(blog_word_count_byRow, na.rm = TRUE)
news_max_word_in_line <- max(news_word_count_byRow, na.rm = TRUE)
twit_max_word_in_line <- max(twit_word_count_byRow, na.rm = TRUE)

### find the total number of words in each file
blog_all_quantity_words <- sum(blog_word_count_byRow, na.rm = TRUE)
news_all_quantity_words <- sum(news_word_count_byRow, na.rm = TRUE)
twit_all_quantity_words <- sum(twit_word_count_byRow, na.rm = TRUE)

Unique words

I also tried to find the number of unique words in each file, but I ran into an out-of-memory error. So I decided to count unique words only for a 10 percent sample of each file and of the combined corpus. One of my first conclusions concerned excluding stopwords, so I decided to compare how many unique words my random corpus contains with and without stopwords:

### take a 10 percent sample of each file
blog_persent <- sample(blogs_light_cleaned, round(blog_lines/10))
news_persent <- sample(news_light_cleaned, round(news_lines/10))
twit_persent <- sample(twit_light_cleaned, round(twitter_lines/10))
union_text_persent <- sample(union_text, round(length(union_text)/10))

### word lists before removing stopwords
blog_word_list <- word_list(blog_persent)
news_word_list <- word_list(news_persent)
twit_word_list <- word_list(twit_persent)
union_text_word_list <- word_list(union_text_persent)

### count all words in the union corpus
union_text_all_quantity_words <- sum(as.data.frame(union_text_word_list$fwl)[,2],na.rm = TRUE)


### count unique words in the 10 percent samples before removing stopwords
blog_unique_words <- nrow(as.data.frame(blog_word_list$fwl))
news_unique_words <- nrow(as.data.frame(news_word_list$fwl))
twit_unique_words <- nrow(as.data.frame(twit_word_list$fwl))
union_text_unique_quantity_words <- nrow(as.data.frame(union_text_word_list$fwl))

### word lists after removing stopwords
blog_word_list_cleaned <- word_list(blog_persent, stopwords = stopwords("english"))
news_word_list_cleaned <- word_list(news_persent, stopwords = stopwords("english"))
twit_word_list_cleaned <- word_list(twit_persent, stopwords = stopwords("english"))
union_text_word_list_cleaned <- word_list(union_text_persent, stopwords = stopwords("english"))

### count unique words in the 10 percent samples after removing stopwords
blog_unique_words_cleaned <- nrow(as.data.frame(blog_word_list_cleaned$fswl))
news_unique_words_cleaned <- nrow(as.data.frame(news_word_list_cleaned$fswl))
twit_unique_words_cleaned <- nrow(as.data.frame(twit_word_list_cleaned$fswl))
union_text_unique_quantity_words_cleaned <- nrow(as.data.frame(union_text_word_list_cleaned$fswl))

Basic values

I put all the obtained values into an aggregated table.

### Create aggregated table
aggregated_table <- data.table( "corpus" = c("blogs", "news", "twitter", "union_corpus"),
                                "file_size(bytes)" = c(blog_fi, news_fi, twit_fi, NA),
                                "quantity_of_lines" = c(blog_lines, news_lines, twitter_lines, sum(c(blog_lines, news_lines,twitter_lines))),
                                "max_char_in_line" = c(blog_maxChar_in_lines, news_maxChar_in_lines, twitter_maxChar_in_lines, NA),
                                "quantity_of_words" = c(blog_all_quantity_words, news_all_quantity_words, twit_all_quantity_words, sum(c(blog_all_quantity_words, news_all_quantity_words, twit_all_quantity_words))),
                                "max_words_in_line" = c(blog_max_word_in_line, news_max_word_in_line,twit_max_word_in_line, NA),
                                "unique_words" = c(blog_unique_words,news_unique_words,twit_unique_words,union_text_unique_quantity_words),
                                "unique_words_without_stoplist" = c(blog_unique_words_cleaned, news_unique_words_cleaned,twit_unique_words_cleaned,union_text_unique_quantity_words_cleaned)
                                
)

The result can be seen in the following table:

library(DT)
datatable(aggregated_table)

As you can see, the difference between the number of unique words with and without stopwords is only a little more than one hundred words for each corpus. But these roughly one hundred words change the picture of the N-grams a lot, as will be shown next.
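For reference, you can check how large the stoplist actually is (a quick sketch using the same tm stoplist as above):

### how many words are on the English stoplist used here
length(stopwords("english"))
head(stopwords("english"), 10)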

Save results

Dear friend, if you decide to reproduce my results, save them now and clean your memory. You can use the following code to do it.

save(aggregated_table, file = "aggregated_table.obj")
write(union_text, file = "union_text.txt")
rm(list = ls())

load(file = "aggregated_table.obj")
union_text <- readLines("union_text.txt")
union_text_persent <- sample(union_text, round(length(union_text)/10))

Most frequently words and phrases

I’m impressed. You are still reading :)

OK. Now I decided to find the most frequent words and phrases. But I faced a problem: should I look for the most frequent words with or without stopwords? This question worried me a lot, so I decided to compare the results.

First I combined all three files into one corpus. You saw this earlier; it was something like this:

union_text <- c(blogs_light_cleaned, news_light_cleaned, twit_light_cleaned)

After that I decided to make two versions of this corpus and take 20000 random lines from each: one keeps the stopwords and the other has them removed. I also decided not to clean it from profanity, because profanity is also a big part of the language; if it becomes necessary, I can do it later.

### create the dirty (with stopwords) and the cleaned (stopwords removed) corpus
union_dirty_persent <- sample(union_text_persent, 20000)
union_text_cleaned <- tolower(union_text_persent)
union_text_cleaned <- removeWords(union_text_cleaned, c(stopwords("english"), "i", "the"))
union_cleaned_persent <- sample(union_text_cleaned, 20000)

OK, the corpora are ready. I took one of my functions and built the following N-grams:

- OneGram: the most frequent single words (for the dirty and the cleaned corpus)
- BiGram: the most frequent two-word phrases (for the dirty and the cleaned corpus)
- TriGram: the most frequent three-word phrases (for the dirty and the cleaned corpus)

Onegram_dirty <- getNgram(union_dirty_persent, 1)
Bigram_dirty <- getNgram(union_dirty_persent, 2)
Trigram_dirty <- getNgram(union_dirty_persent, 3)

Onegram_cleaned <- getNgram(union_cleaned_persent, 1)
Bigram_cleaned <- getNgram(union_cleaned_persent, 2)
Trigram_cleaned <- getNgram(union_cleaned_persent, 3)

Comparing N-grams

Next I started looking at the N-grams and comparing the top 10 of each in a table.

oneGram_comparing_dt <- ngram_comparing_table(Onegram_dirty, Onegram_cleaned)
BiGram_comparing_dt <- ngram_comparing_table(Bigram_dirty, Bigram_cleaned)
TriGram_comparing_dt <- ngram_comparing_table(Trigram_dirty, Trigram_cleaned)

Below you can find the comparison tables and visualizations.

Most frequent words (OneGram)

datatable(oneGram_comparing_dt)
print_comparing_bar_plot(Onegram_dirty, Onegram_cleaned)

Most frequent BiGrams

datatable(BiGram_comparing_dt)
print_comparing_bar_plot(Bigram_dirty, Bigram_cleaned)

Most frequent TriGrams

datatable(TriGram_comparing_dt)
print_comparing_bar_plot(Trigram_dirty, Trigram_cleaned)

Conclusions:

As you can see, our roughly one hundred stopwords (red) are very frequent, even much more frequent than ordinary words (blue). So the main conclusion of this exploratory analysis is: do not exclude stopwords from the model, because there is a very high probability that the next word will be one of them. At the very least, this will help a basic model save memory and computing power when making predictions.
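To illustrate why this matters for the prediction task, here is a rough sketch (not part of the analysis above; the function name is hypothetical) of how the bigram table computed earlier could already be used to guess the next word: look up the bigrams that start with the last typed word and take the most frequent continuations.

### rough sketch of a bigram-based next-word guess using Bigram_dirty
### (columns: word = "w1 w2", Freq; the table is already sorted by Freq, decreasing)
predict_next_word <- function(last_word, bigram_table, top_n = 3) {
    pattern <- paste0("^", tolower(last_word), " ")
    candidates <- bigram_table[grepl(pattern, bigram_table$word), ]
    head(sub(pattern, "", candidates$word), top_n)   # keep only the second word of each bigram
}

predict_next_word("of", Bigram_dirty)
### the top continuations are typically stopwords such as "the"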

Next steps