Overview

The main objective is to build a sample corpus, compute the 2-gram and 3-gram term-document matrices, and perform exploratory analysis on the resulting terms. The data can be downloaded from

https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

Archive’s content: the English files used in this analysis are en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt, located under final/en_US/.

Due to computational performance constraints, only the first 3,000 lines of each file are used.
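For reproducibility, here is a minimal sketch of fetching and unpacking the archive; it assumes a week2/ working folder, which matches the file paths used in the next section:

# Download and unpack the corpus archive (run once).
# Assumes a "week2" folder; the zip extracts to week2/final/en_US/.
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "week2/Coursera-SwiftKey.zip"

if (!file.exists(zip_file)) {
  dir.create("week2", showWarnings = FALSE)
  download.file(zip_url, destfile = zip_file, mode = "wb")
  unzip(zip_file, exdir = "week2")
}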

Data Preparation

library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(NLP)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# n-gram tokenizers based on NLP::ngrams(): split a document into words,
# then paste consecutive pairs / triples back together.
bigram_tokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}

trigram_tokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
}
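As a quick sanity check (illustration only; the toy corpus below is not part of the analysis), the 2-gram tokenizer can be tried on a single short document:

# Illustration only: apply the 2-gram tokenizer to a one-document toy corpus.
example_corpus <- VCorpus(VectorSource("the quick brown fox jumps"))
bigram_tokenizer(example_corpus[[1]])
# should give: "the quick" "quick brown" "brown fox" "fox jumps"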

# Read a 3,000-line sample from each source.
data_blog <- readLines("week2/final/en_US/en_US.blogs.txt", skipNul = TRUE, n = 3000)
data_news <- readLines("week2/final/en_US/en_US.news.txt", skipNul = TRUE, n = 3000)
data_twitter <- readLines("week2/final/en_US/en_US.twitter.txt", skipNul = TRUE, n = 3000)

# Corpus aggregation.
data_vector <- c(data_blog, data_news, data_twitter)
data_corpus <- VCorpus(VectorSource(data_vector))
  • Blog lines: 3000
  • News lines: 3000
  • Twitter lines: 3000
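These sample sizes can be confirmed directly from the objects built above:

# Confirm the sample sizes and the total corpus length.
length(data_blog)     # 3000
length(data_news)     # 3000
length(data_twitter)  # 3000
length(data_corpus)   # 9000 documents in total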

Data Cleaning and Exploratory Analysis

The content is transformed to lowercase and cleaned by removing:

  • Repeated characters (runs of three or more identical letters collapsed to two)
  • URLs
  • Punctuation
  • Numbers
  • Extra whitespace
  • English stopwords
# Data cleaning helpers.
removeRepeat <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x)  # collapse 3+ repeated letters to 2
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)              # drop whole URLs, not just the "http" prefix

data_corpus <- tm_map(data_corpus, content_transformer(tolower))
data_corpus <- tm_map(data_corpus, content_transformer(removeRepeat))
data_corpus <- tm_map(data_corpus, content_transformer(removeURL))
data_corpus <- tm_map(data_corpus, removePunctuation)
data_corpus <- tm_map(data_corpus, removeNumbers)
data_corpus <- tm_map(data_corpus, stripWhitespace)
data_corpus <- tm_map(data_corpus, removeWords, stopwords("english"))
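To spot-check the transformations, the first cleaned document can be inspected (the exact output depends on the sampled lines):

# Peek at the beginning of the first document after cleaning.
substr(content(data_corpus[[1]]), 1, 80)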

A term-document matrix is built for each n-gram level and used to rank terms by frequency.

1 Gram

one_gram_dtm <- TermDocumentMatrix(data_corpus)
one_gram_matrix <- as.data.frame(as.matrix(one_gram_dtm))
one_gram_v <- sort(rowSums(one_gram_matrix), decreasing = TRUE)
one_gram_d <- data.frame(word = names(one_gram_v), freq = one_gram_v)

# Bar chart
ggplot(data = head(one_gram_d, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
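A side note on the performance issue mentioned in the overview: converting the full term-document matrix to a dense data frame is memory-intensive. A leaner alternative (a sketch, assuming the slam package that tm depends on is available) computes the row sums on the sparse matrix directly:

# Memory-friendlier frequency count: row sums on the sparse matrix,
# skipping the dense as.matrix() conversion.
library(slam)
one_gram_freq <- sort(row_sums(one_gram_dtm), decreasing = TRUE)
head(one_gram_freq, 10)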

2 Gram

two_gram_dtm <- TermDocumentMatrix(data_corpus, control = list(tokenize = bigram_tokenizer))
two_gram_matrix <- as.data.frame(as.matrix(two_gram_dtm))
two_gram_v <- sort(rowSums(two_gram_matrix), decreasing = TRUE)
two_gram_d <- data.frame(word = names(two_gram_v), freq = two_gram_v)

# Bar chart
ggplot(data = head(two_gram_d, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

3 Gram

three_gram_dtm <- TermDocumentMatrix(data_corpus, control = list(tokenize = trigram_tokenizer))
three_gram_matrix <- as.data.frame(as.matrix(three_gram_dtm))
three_gram_v <- sort(rowSums(three_gram_matrix), decreasing = TRUE)
three_gram_d <- data.frame(word = names(three_gram_v), freq = three_gram_v)

# Bar chart
ggplot(data = head(three_gram_d, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Up Next

  1. Build a larger corpus from the original data and tokenize it into 2-grams and 3-grams;
  2. Optimize the code for faster processing;
  3. Build a prediction algorithm that matches user input against the 2-gram and 3-gram frequency tables (a rough sketch of the lookup idea follows below).
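The sketch below is hypothetical and only illustrates the lookup idea in item 3; the function name predict_next_word and the simple back-off rule are assumptions, not the final algorithm. It reuses the two_gram_d and three_gram_d frequency tables built above.

# Hypothetical sketch only: return the most frequent 3-gram completion of the
# last two input words; fall back to the 2-gram table on a miss.
predict_next_word <- function(input) {
  tokens <- tail(unlist(strsplit(tolower(input), "\\s+")), 2)
  tri_prefix <- paste0("^", paste(tokens, collapse = " "), " ")
  tri_hits <- three_gram_d[grepl(tri_prefix, three_gram_d$word), ]
  if (nrow(tri_hits) > 0) {
    return(sub(".* ", "", as.character(tri_hits$word[1])))  # last word of top 3-gram
  }
  bi_prefix <- paste0("^", tokens[length(tokens)], " ")
  bi_hits <- two_gram_d[grepl(bi_prefix, two_gram_d$word), ]
  if (nrow(bi_hits) > 0) {
    return(sub(".* ", "", as.character(bi_hits$word[1])))   # last word of top 2-gram
  }
  NA_character_                                             # no match in either table
}

predict_next_word("i love new")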