Introduction

This is the Week 2 Milestone Report for the Data Science Capstone. Its goal is to show that I have become familiar with the data and that I am on track to build my prediction algorithm. This document, published on RPubs, presents my exploratory analysis and my goals for the eventual app and algorithm: it describes the major features of the data I have identified and briefly summarizes my plans for creating the prediction algorithm and Shiny app, in a way that is understandable to a non-data-scientist manager.

Import Data

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(stringi)
library(tm)
## Loading required package: NLP
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)
library(quanteda)
## quanteda version 0.9.9.50
## Using 5 of 6 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:utils':
## 
##     View
# Set up parallel clusters to accelerate execution time
#jobcluster <- makeCluster(detectCores())
#invisible(clusterEvalQ(jobcluster, library(tm)))
#invisible(clusterEvalQ(jobcluster, library(stringi)))
#invisible(clusterEvalQ(jobcluster, library(wordcloud)))



setwd("D:/Google Drive/Vicente/Estudios/Final project Data Science/")
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

Summary Statistics

stats <- data.frame(
    File = c("blogs", "news", "twitter"),
    t(rbind(
        sapply(list(blogs, news, twitter), stri_stats_general),
        TotalWords = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]
    ))
)

print(stats)
##      File   Lines LinesNEmpty     Chars CharsNWhite TotalWords
## 1   blogs  899288      899288 206824382   170389539   37570839
## 2    news   77259       77259  15639408    13072698    2651432
## 3 twitter 2360148     2360148 162096031   134082634   30451128

Sampling

Since the raw data files are very large, it is sensible to work with a sample before starting the analysis. I sample 5% of the lines of each file.

set.seed(666)
sample_size <- 0.05

blogs_index <- sample(seq_len(length(blogs)), length(blogs)*sample_size)
news_index <-  sample(seq_len(length(news)),length(news)*sample_size)
twitter_index <-  sample(seq_len(length(twitter)), length(twitter)*sample_size)

blogs_sample <- blogs[blogs_index]
news_sample <- news[news_index]
twitter_sample <- twitter[twitter_index]

stats_samples <- data.frame(
    File = c("blogs_sample", "news_sample", "twitter_sample"),
    t(rbind(
        sapply(list(blogs_sample, news_sample, twitter_sample), stri_stats_general),
        TotalWords = sapply(list(blogs_sample, news_sample, twitter_sample), stri_stats_latex)[4, ]
    ))
)

print(stats_samples)
##             File  Lines LinesNEmpty    Chars CharsNWhite TotalWords
## 1   blogs_sample  44964       44964 10349414     8524676    1880991
## 2    news_sample   3862        3862   782265      653851     132679
## 3 twitter_sample 118007      118007  8097873     6698120    1521118

As expected, each sample contains roughly 5% of the lines and words of the corresponding original file.
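
As a quick sanity check (a small sketch using the two data frames computed above), the ratio of sampled lines to original lines should be close to the 5% sampling rate:

# Ratio of sampled lines to original lines, per file (should be close to 0.05)
round(stats_samples$Lines / stats$Lines, 3)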

Preprocessing: cleaning up the data

blogs.corpus <- corpus(blogs_sample)
news.corpus <- corpus(news_sample)
twitter.corpus <- corpus(twitter_sample)

docvars(blogs.corpus,"type") <- "blog"
docvars(news.corpus, "type") <- "news"
docvars(twitter.corpus, "type") <-  "twitter"

corpus <- blogs.corpus + news.corpus + twitter.corpus
feature.matrix <- dfm(corpus, remove_punct = TRUE, remove_numbers= TRUE, remove_symbols= TRUE, stem = TRUE)

textplot_wordcloud(feature.matrix, max.words=30, colors=brewer.pal(6,"Dark2"), scale = c(8, 0.8))

Top Features

top.features <- topfeatures(feature.matrix, n = 30)
top.features.df <- data.frame(top.features)
top.features.df["unigram"] <-  rownames(top.features.df)

ggplot(top.features.df, aes(x = reorder(unigram, -top.features), y = top.features)) + 
    geom_col() + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
    xlab("Feature") + 
    ylab("Count")

Creating the N-grams

Bigrams

bigram.frequency.threshold <- 10
bigram.top.feature.count <- 30

dfm.bigram <- dfm(corpus, remove_punct = TRUE, stem = TRUE, ngrams = 2)
bigram.frequency <- colSums(dfm.bigram)

bigram.frequency <- sort(bigram.frequency, decreasing = TRUE)

# Keep only the bigrams that occur more often than the threshold
bigram.frequency.pruned <- bigram.frequency[bigram.frequency > bigram.frequency.threshold]

bigram.df <- data.frame(bigram.frequency.pruned[seq(bigram.top.feature.count)])
names(bigram.df) <- "bigram.count"
bigram.df["bigram"] <- rownames(bigram.df)

ggplot(bigram.df, aes(x = reorder(bigram, -bigram.count), y = bigram.count)) + 
    geom_bar(position = "identity", stat = "identity", fill = "darkblue") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Bigram") + 
    ylab("Count")

Trigrams

trigram.frequency.threshold <- 10
trigram.top.feature.count <- 30

dfm.trigram <- dfm(corpus, remove_punct= TRUE, stem = TRUE, ngrams = 3)


trigram.frequency <- colSums(dfm.trigram)
trigram.frequency <- sort(trigram.frequency, decreasing = TRUE)
# Keep only the trigrams that occur more often than the threshold
trigram.frequency.pruned <- trigram.frequency[trigram.frequency > trigram.frequency.threshold]


trigram.df <- data.frame(trigram.frequency.pruned[seq(trigram.top.feature.count)])
names(trigram.df) <- "trigram.count"
trigram.df["trigram"] <- rownames(trigram.df)

ggplot(trigram.df, aes(x = reorder(trigram, -trigram.count), y = trigram.count)) + 
    geom_bar(position = "identity", stat = "identity", fill = "darkgray") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Trigram") + 
    ylab("Count")

Plans for the prediction model

I have been testing the quanteda library to develop the prediction model. However, it requires a lot of resources, both in processing time and in memory: the dfm.trigram object alone, for example, takes 206.3 Mb.
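
The figure quoted above can be reproduced with object.size() from base R (a quick check rather than part of the pipeline; the exact value depends on the sample drawn):

# Report the in-memory size of the trigram document-feature matrix
format(object.size(dfm.trigram), units = "Mb")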

I still need to find the model that best handles the trade-off between accuracy and efficiency (prediction time and RAM usage). For a smartphone keyboard application, it is pointless to gain 5% accuracy at the cost of a 5-second prediction time.
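
To make the plan concrete, below is a minimal sketch of the kind of backoff lookup I have in mind, built on the bigram.frequency and trigram.frequency vectors computed above. The helper name predict_next is hypothetical, the input words are assumed to be lower-cased and stemmed the same way as the corpus, and "_" is quanteda's default n-gram separator. The final model will replace these raw counts with a proper smoothing/backoff scheme and a much smaller, pruned lookup table.

# Minimal sketch (not the final model): return the most frequent continuations
# of the last two words from the trigram counts, backing off to bigram counts.
predict_next <- function(w1, w2, n = 3) {
    # Trigrams that start with "w1_w2_"
    prefix <- paste0(w1, "_", w2, "_")
    hits <- trigram.frequency[startsWith(names(trigram.frequency), prefix)]
    if (length(hits) == 0) {
        # Back off: bigrams that start with "w2_"
        prefix <- paste0(w2, "_")
        hits <- bigram.frequency[startsWith(names(bigram.frequency), prefix)]
    }
    if (length(hits) == 0) return(character(0))
    top <- names(sort(hits, decreasing = TRUE))[seq_len(min(n, length(hits)))]
    # Keep only the predicted (last) token of each matching n-gram
    vapply(strsplit(top, "_"), function(x) x[length(x)], character(1))
}

# Example usage (the output depends on the sample drawn)
predict_next("one", "of")

Even this naive lookup illustrates the efficiency problem: scanning the full trigram table for every keystroke is too slow, so the production version will need pruned, pre-indexed tables.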