The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs that explains your exploratory analysis and your goals for the eventual app and algorithm.
This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
The motivation for this project is to demonstrate that the data has been downloaded and successfully loaded, create a basic report of summary statistics about the data sets, report any interesting findings identified so far, and get feedback on the plans for creating the prediction algorithm and Shiny app.
# Loading the required packages
library(tm)
library(tokenizers)
library(dplyr)
library(ggplot2)
library(caret)
library(stringr)
library(quanteda)
library(NLP)
library(rJava)
library(RWeka)
library(openNLP)
This report contains my exploratory data analysis and a summary of initial findings from a data set of US tweets, blogs, and news articles. The original data set was downloaded as a zip file from the following link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
# Setting the working directory to the en_US folder
setwd('C:/Users/Cristhian/Documents/final/en_US/')
list.files()
## [1] "en_US.blogs.txt" "en_US.corpus.rds" "en_US.corpus.txt"
## [4] "en_US.news.txt" "en_US.twitter.txt" "milestones.html"
## [7] "milestones.Rmd" "Profanities.txt"
The first step in this process was to create three separate objects, one for each of the US text files: blogs, news, and Twitter. The files are rather large, so they take a few minutes to read in.
# Importing the datasets
twitter <- readLines(file("en_US.twitter.txt", "r"))
## Warning in readLines(file("en_US.twitter.txt", "r")): line 167155 appears to
## contain an embedded nul
## Warning in readLines(file("en_US.twitter.txt", "r")): line 268547 appears to
## contain an embedded nul
## Warning in readLines(file("en_US.twitter.txt", "r")): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(file("en_US.twitter.txt", "r")): line 1759032 appears to
## contain an embedded nul
news <- readLines(file("en_US.news.txt", "r"))
## Warning in readLines(file("en_US.news.txt", "r")): incomplete final line found
## on 'en_US.news.txt'
blog <- readLines(file("en_US.blogs.txt", "r"))
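The embedded-nul and incomplete-final-line warnings above are harmless for this analysis, but they can be avoided by passing skipNul = TRUE to readLines() and closing the connections explicitly. A minimal sketch, where the read_clean() helper is hypothetical and not part of the analysis that was actually run:
# Optional alternative: read the files while skipping embedded nuls and
# suppressing the incomplete-final-line warning
read_clean <- function(path) {
  con <- file(path, open = "r", encoding = "UTF-8")
  on.exit(close(con))                  # make sure the connection is released
  readLines(con, skipNul = TRUE, warn = FALSE)
}
# twitter <- read_clean("en_US.twitter.txt")
# news    <- read_clean("en_US.news.txt")
# blog    <- read_clean("en_US.blogs.txt")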
Below I summarize the datasets, including line counts, word counts, and object sizes.
# Generating a summary table of the datasets
dfsum <- as.data.frame(c("Twitter", "News", "Blog"))
lines <- function(x) {
  length(x)                 # number of lines read from the file
}
words <- function(x) {
  require(stringr)
  sum(str_count(x))         # total count returned by str_count() with its default pattern
}
size <- function(x) {
  object.size(x)            # size of the object in memory, in bytes
}
dfsum$lines <- c(lines(twitter), lines(news), lines(blog))
dfsum$words <- c(words(twitter), words(news), words(blog))
dfsum$Size <- c(size(twitter), size(news), size(blog))
colnames(dfsum) <- c("Text", "Lines", "Words", "Size (bytes)")
dfsum
## Text Lines Words Size (bytes)
## 1 Twitter 2360148 162384825 334484736
## 2 News 77259 15683765 20729472
## 3 Blog 899288 208361438 267758632
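Note that str_count() with its default pattern appears to count characters rather than whitespace-separated words, so the Words column above is best read as an approximate character count. If a true word count is wanted, one option is to count non-whitespace tokens; a minimal sketch with a hypothetical word_count() helper (not used in the table above):
# Count whitespace-separated tokens rather than characters
word_count <- function(x) {
  sum(stringr::str_count(x, "\\S+"))   # "\\S+" matches each run of non-whitespace
}
# e.g. word_count(twitter)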
Because these datasets are large, we can extract subsets of them to create a sample of all the texts. The datasets were combined into a single object, and 5% of the text lines were extracted to form the sample that will be used to build the models. I first attempted to use 10% of the text, but the resulting object was too large, so only 5% of the data was used.
# Creating a 5% sample from these datasets
set.seed(444)
Text <- c(twitter, news, blog)
Text_Sample <- Text[which(rbinom(length(Text), 1, 0.05) == 1)]
# Sample statistics
samplesum <- as.data.frame(c("Lines", "Words", "Size (bytes)"))
samplesum$Summary <- c(length(Text_Sample), sum(str_count(Text_Sample)), object.size(Text_Sample))
colnames(samplesum) <- c("Summary", "")
samplesum
## Summary
## 1 Lines 166392
## 2 Words 19177966
## 3 Size (bytes) 31153008
# Removing old datasets
rm(list=c("blog", "news", "twitter"))
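Because rbinom() draws each line independently, the sample size varies slightly around 5%. If an exact 5% sample were preferred, one alternative (a sketch only, not what was run above; Text_Sample_alt is a hypothetical name) is to sample a fixed number of line indices:
# Alternative: draw an exact 5% sample of lines (Text is still in memory)
set.seed(444)
sample_size <- floor(0.05 * length(Text))
Text_Sample_alt <- Text[sample(length(Text), sample_size)]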
Before modelling, the sample text needs to be cleaned: URLs, Twitter handles, and email addresses are stripped out, profanities are removed using a published list of bad words, and the text is converted to lower case and cleared of English stopwords, punctuation, numbers, and extra whitespace.
cleancorpus <- function(dataset) {
  # Turning the dataset into a corpus
  text <- VCorpus(VectorSource(dataset))
  # Creating a transformer that replaces a matched pattern with a space
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  # Removing URLs, Twitter handles, and email-like strings
  text <- tm_map(text, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
  text <- tm_map(text, toSpace, "@[^\\s]+")
  text <- tm_map(text, toSpace, "\\b[A-Z a-z 0-9._ -]*[@](.*?)[.]{1,3} \\b")
  # Profanity filter
  download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt", destfile = "Profanities.txt")
  con <- file("Profanities.txt", open = "r")
  profanity <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  profanity <- iconv(profanity, "latin1", "ASCII", sub = "")
  text <- tm_map(text, removeWords, profanity)
  # Cleaning the corpus: lower case, stopwords, punctuation, numbers, whitespace
  text <- tm_map(text, content_transformer(tolower))
  text <- tm_map(text, removeWords, stopwords("english"))
  text <- tm_map(text, removePunctuation)
  text <- tm_map(text, removeNumbers)
  text <- tm_map(text, stripWhitespace)
  text <- tm_map(text, PlainTextDocument)
  return(text)
}
# Create the final corpus and save
corpus <- cleancorpus(Text_Sample)
saveRDS(corpus, file = "en_US.corpus.rds")
# Turn the corpus into a dataframe
corpusText <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
con <- file("en_US.corpus.txt", open = "w")
writeLines(corpusText$text, con)
close(con)
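Saving the corpus to disk means the cleaning step does not have to be repeated in later sessions. A minimal sketch of reloading the saved objects, assuming the files above are in the working directory:
# Reload the cleaned corpus and its plain-text version in a later session,
# instead of re-running cleancorpus() on the raw sample
corpus <- readRDS("en_US.corpus.rds")
corpusText <- data.frame(text = readLines("en_US.corpus.txt"),
                         stringsAsFactors = FALSE)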
Before creating the prediction model, we need to build n-grams (unigrams, bigrams, and trigrams) to see the most common word combinations. Starting with the unigrams:
#Unigram
unigramToken <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = unigramToken))
unigramMatrixFreq <- sort(rowSums(as.matrix(removeSparseTerms(unigramMatrix, 0.99))), decreasing = TRUE)
unigramMatrixFreq <- data.frame(word = names(unigramMatrixFreq), freq = unigramMatrixFreq)
# Creating a plot with the most common unigrams
uniplot <- ggplot(unigramMatrixFreq[1:20, ], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = I("grey")) +
  geom_text(aes(label = freq), vjust = -0.20, size = 3) +
  xlab("") +
  ylab("Frequency") +
  theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(hjust = 1.0, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5)) +
  ggtitle("20 Most Common Unigrams")
print(uniplot)
# Bigram
bigramToken <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = bigramToken))
bigramMatrixFreq <- sort(rowSums(as.matrix(removeSparseTerms(bigramMatrix, 0.999))), decreasing = TRUE)
bigramMatrixFreq <- data.frame(word = names(bigramMatrixFreq), freq = bigramMatrixFreq)
# Creating a plot with the most common bigrams
biplot <- ggplot(bigramMatrixFreq[1:20, ], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = I("grey")) +
  geom_text(aes(label = freq), vjust = -0.20, size = 3) +
  xlab("") +
  ylab("Frequency") +
  theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(hjust = 1.0, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5)) +
  ggtitle("20 Most Common Bigrams")
print(biplot)
# Trigram
trigramToken <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = trigramToken))
trigramMatrixFreq <- sort(rowSums(as.matrix(removeSparseTerms(trigramMatrix, 0.9999))), decreasing = TRUE)
trigramMatrixFreq <- data.frame(word = names(trigramMatrixFreq), freq = trigramMatrixFreq)
# Creating a plot with the most common trigrams
triplot <- ggplot(trigramMatrixFreq[1:20, ], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = I("grey")) +
  geom_text(aes(label = freq), vjust = -0.20, size = 3) +
  xlab("") +
  ylab("Frequency") +
  theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(hjust = 1.0, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5)) +
  ggtitle("20 Most Common Trigrams")
print(triplot)
As a next step, I will create a prediction model and integrate it into a Shiny app for word prediction. The most promising approach is to build the model using n-grams and Markov chains.
I will use the n-gram data frames created above to calculate the probability of the next word occurring. The input string will be tokenized, the last two words will be isolated, and they will be cross-checked against the data frames to find the most probable next word.
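As a rough illustration of this planned lookup, here is a minimal back-off sketch against the bigram and trigram frequency tables built above. The predict_next_word() helper is hypothetical and ignores details such as stopword removal and regex edge cases:
# Hypothetical sketch of the planned lookup: take the last two words of the
# input, look for trigrams that start with them, and back off to bigrams
# (then give up) if nothing matches. Uses bigramMatrixFreq and trigramMatrixFreq.
predict_next_word <- function(input) {
  tokens <- unlist(strsplit(tolower(input), "\\s+"))
  last_two <- paste(tail(tokens, 2), collapse = " ")
  last_one <- tail(tokens, 1)
  # Trigrams whose first two words match the end of the input
  hits <- trigramMatrixFreq[startsWith(as.character(trigramMatrixFreq$word),
                                       paste0(last_two, " ")), ]
  if (nrow(hits) == 0) {
    # Back off to bigrams whose first word matches the last word of the input
    hits <- bigramMatrixFreq[startsWith(as.character(bigramMatrixFreq$word),
                                        paste0(last_one, " ")), ]
  }
  if (nrow(hits) == 0) return(NA_character_)
  # Return the final word of the most frequent matching n-gram
  best <- as.character(hits$word[which.max(hits$freq)])
  tail(unlist(strsplit(best, " ")), 1)
}
# Example usage: predict_next_word("happy mothers")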
Lastly, the model will be integrated into a Shiny application that provides a simple and intuitive front end for end users.
The key steps are detailed below:
- Use n-grams to generate tokens of one to three words.
- Summarize token frequencies and find associations between tokens.
- Build predictive models using the tokens.
- Develop the data product as a Shiny app that recommends the next word based on user input (a minimal skeleton is sketched below).
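A minimal sketch of how the Shiny front end could look, assuming a predict_next_word() function like the hypothetical one sketched earlier is available:
library(shiny)

# Minimal Shiny skeleton for the planned word-prediction app.
# predict_next_word() is the hypothetical prediction function, not yet built.
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Type a phrase:", value = ""),
  h4("Predicted next word:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$phrase)) == 0) return("")
    predict_next_word(input$phrase)
  })
}

shinyApp(ui = ui, server = server)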