Overview

Environment Setup

# Give the JVM extra heap space before loading the Java-backed packages (RWeka, openNLP)
options(java.parameters = "-Xmx8000m")

library(stringi)   # fast string operations (word counts)
library(tm)        # corpus creation and cleaning
library(RWeka)     # Weka-based n-gram tokenization
library(openNLP)   # Apache OpenNLP interface
library(qdap)      # sentence detection
library(ggplot2)   # plotting

Data Acquisition

Loading the Dataset

  • Since the datasets are huge and take a while to download, the download and unzip steps are performed outside this document; the code here therefore reads directly from a previously downloaded directory (a sketch of that one-time step is included after the loading code below)
  • The code snippet below reads each line of a given dataset file
get_raw_data <- function(file_path) {
    # Open the file and read lines from the file
    connection <- file(file_path, "r")
    file_lines <- readLines(connection, skipNul = TRUE)
    close(connection)
    
    file_lines
}

data_blogs <- get_raw_data("./data/final/en_US/en_US.blogs.txt")
data_news <- get_raw_data("./data/final/en_US/en_US.news.txt")
data_twitter <- get_raw_data("./data/final/en_US/en_US.twitter.txt")
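
For reference, the one-time download and unzip step performed outside this document would look roughly like the sketch below; the dataset URL is an assumption (the usual Coursera SwiftKey capstone location) and the paths mirror the directory referenced above.
# Sketch of the one-time download/unzip step (run outside this document)
# NOTE: the URL is an assumption; adjust it and the paths to your environment
data_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("./data/final/en_US")) {
    dir.create("./data", showWarnings = FALSE)
    download.file(data_url, destfile = "./data/Coursera-SwiftKey.zip", mode = "wb")
    unzip("./data/Coursera-SwiftKey.zip", exdir = "./data")
}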

Basic Dataset Summary

  • The code snippet below prints the line and word counts of each dataset
  • These counts give a fair indication of how large the datasets are
get_raw_data_summary <- function(file_lines) {
    # Get line and word counts from file_lines
    line_count <- length(file_lines)
    
    # Count non-whitespace tokens across all lines (vectorized over the whole vector)
    word_count <- sum(stri_count(file_lines, regex = "\\S+"))
    
    print(paste("Line Count = ", line_count))
    print(paste("Word Count = ", word_count))
}

get_raw_data_summary(data_blogs)
## [1] "Line Count =  899288"
## [1] "Word Count =  37334131"
get_raw_data_summary(data_news)
## [1] "Line Count =  1010242"
## [1] "Word Count =  34372530"
get_raw_data_summary(data_twitter)
## [1] "Line Count =  2360148"
## [1] "Word Count =  30373583"
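
The on-disk size of each file gives another view of the scale. A minimal sketch, reusing the same file paths as above:
# Report the on-disk size of each raw file in megabytes
raw_files <- c("./data/final/en_US/en_US.blogs.txt",
               "./data/final/en_US/en_US.news.txt",
               "./data/final/en_US/en_US.twitter.txt")
round(file.size(raw_files) / 1024^2, 1)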

Data Processing

Dataset Sampling

  • We can reduce the raw data to a manageable size by sampling it
  • We can flip a biased coin with “rbinom()” and keep only a few randomly selected lines for processing
  • The code snippet below performs this sampling on the raw datasets
sample_raw_data <- function(file_lines) {
    line_count <- length(file_lines)

    # Flip a biased coin for every line and keep only the lines that come up 1;
    # the seed keeps the sample reproducible across runs
    set.seed(100)
    sampled <- file_lines[rbinom(line_count, 1,  0.0001) == 1]
    
    sampled
}

get_sampled_data <- function(data_blogs, data_news, data_twitter) {
    # Sample the raw data
    sampled_data_blogs <- sample_raw_data(data_blogs)
    sampled_data_news <- sample_raw_data(data_news)
    sampled_data_twitter <- sample_raw_data(data_twitter)
    
    # Concatenate the three samples into one master sample vector
    sampled_data <- c(sampled_data_blogs, sampled_data_news, sampled_data_twitter)
    
    # Perform some clean ups before sentence detection
    # (escape the dots so only the literal abbreviations a.m./p.m. are removed)
    sampled_data <- gsub("a\\.m\\.?", "", sampled_data)
    sampled_data <- gsub("p\\.m\\.?", "", sampled_data)
    sampled_data <- gsub("\\$", "", sampled_data)
    
    # Split text paragraphs into sentences
    sampled_data <- sent_detect(sampled_data, language = "en", model = NULL)
    
    sampled_data
}

sampled_data <- get_sampled_data(data_blogs, data_news, data_twitter)
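
As a quick sanity check on the sampling, we can see how many sentences survived; a usage sketch (the exact count depends on the seed and the 0.0001 sampling rate):
# Number of sentences retained after sampling and sentence splitting
length(sampled_data)
# Peek at the first couple of sentences
head(sampled_data, 2)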

Dataset Tokenization

  • The raw datasets are free-flowing character vectors containing numbers, extra white space, punctuation, etc., which can be removed before further processing
  • English stop words can also be filtered out, since they are expected to be the most frequently used words in the dataset
  • The code snippet below performs these cleanup steps
tokenize_data <- function(sampled_data) {
    # Create corpus and clean the data
    corpus <- VCorpus(VectorSource(sampled_data))
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, stripWhitespace)
    # Wrap base tolower in content_transformer() so the corpus structure is preserved
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removePunctuation, ucp = TRUE)
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    
    corpus
}

tokenized_corpus <- tokenize_data(sampled_data)
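
To verify the cleanup worked as intended, the first few documents of the corpus can be inspected; a usage sketch using the content() accessor from the NLP package, which is attached along with tm:
# Peek at the first two cleaned documents in the corpus
lapply(tokenized_corpus[1:2], content)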

Dataset Profanity Cleanup

  • It is better to remove profane words before further processing, since they should not appear in the output of our prediction algorithm
  • The code snippet below performs the profanity cleanup; “profanity.txt” was obtained from the URL referenced in the code comments and serves as the list of words to remove
remove_profanity_words <- function(tokenized_corpus) {
    # Profanity words downloaded from the URL below
    # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en
    connection <- file("./data/references/profanity.txt", "r")
    profanity_words <- readLines(connection, skipNul = TRUE)
    close(connection)
    
    # removeWords expects a plain character vector of words
    clean_corpus <- tm_map(tokenized_corpus, removeWords, profanity_words)
    
    clean_corpus
}

clean_corpus <- remove_profanity_words(tokenized_corpus)
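
A rough check that the profanity filter changed something is to compare the total number of characters before and after removal; a minimal sketch:
# Total characters before and after profanity removal (the second number should not exceed the first)
sum(nchar(unlist(sapply(tokenized_corpus, content))))
sum(nchar(unlist(sapply(clean_corpus, content))))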

Data Exploration

Creation of Ngram Datasets

  • Let’s create 1-gram, 2-gram and 3-gram datasets from the sampled and cleaned corpus
  • The code snippet below creates each of these as a data frame, which is convenient for plotting
  • Only the 20 most frequent words or word sequences are retained in each data frame
create_ngram <- function(clean_corpus, gram) {
    # Collapse the corpus back into a plain character vector for RWeka
    corpus_text <- unlist(sapply(clean_corpus, content))

    ngram <- NGramTokenizer(corpus_text,
                            Weka_control(min = gram, max = gram,
                                         delimiters = " \\r\\n\\t.,;:\"()?!"))

    # Create a data frame of n-gram frequencies, sorted in decreasing order
    df <- data.frame(table(ngram))
    ngram_df <- df[order(df$Freq, decreasing = TRUE), ]

    # Keep only the 20 most frequently occurring sequences
    ngram_df <- ngram_df[1:20, ]
    colnames(ngram_df) <- c("Word_Sequence", "Frequency")

    ngram_df
}

one_gram <- create_ngram(clean_corpus, 1)
two_gram <- create_ngram(clean_corpus, 2)
three_gram <- create_ngram(clean_corpus, 3)
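
Before plotting, the top of each data frame can be examined directly; a usage sketch:
# Most frequent unigrams, bigrams and trigrams
head(one_gram, 5)
head(two_gram, 5)
head(three_gram, 5)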

Plotting Ngram Datasets

  • Let’s plot the 1-gram, 2-gram and 3-gram data frames from the previous step; this gives a fair idea of which word sequences occur most frequently
  • The code snippet below creates a bar chart for each n-gram data frame
plot_ngram <- function(ngram) {
    ngram_plot <- ggplot(ngram, aes(x = Word_Sequence, y = Frequency)) +
                  geom_bar(stat = "identity", fill = "blue") +
                  geom_text(aes(label = Frequency), vjust = -0.20) +
                  theme(axis.text.x = element_text(angle = 45, hjust = 1))

    print(ngram_plot)
}

plot_ngram(one_gram)

plot_ngram(two_gram)

plot_ngram(three_gram)
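
If the charts need to be kept for later reference, ggsave() can write the most recently printed plot to disk; a minimal sketch with an illustrative file name:
# Save the last printed plot to a PNG file (the file name is illustrative)
ggsave("three_gram_frequencies.png", width = 8, height = 5)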

Next Steps