This is the Week 2 milestone report for the Data Science Capstone. Its goal is to show that I have become familiar with the data and that I am on track to build my prediction algorithm. This document, published on RPubs, describes my exploratory analysis and my goals for the eventual app and algorithm: it covers only the major features of the data I have identified and briefly summarizes my plans for the prediction algorithm and Shiny app in a way that a non-data-scientist manager can understand.
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(stringi)
library(tm)
## Loading required package: NLP
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)
library(quanteda)
## quanteda version 0.9.9.50
## Using 5 of 6 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:utils':
##
## View
# Set up parallel clusters to accelerate execution time (currently disabled)
#jobcluster <- makeCluster(detectCores())
#invisible(clusterEvalQ(jobcluster, library(tm)))
#invisible(clusterEvalQ(jobcluster, library(stringi)))
#invisible(clusterEvalQ(jobcluster, library(wordcloud)))
setwd("D:/Google Drive/Vicente/Estudios/Final project Data Science/")
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
stats <- data.frame(
  File = c("blogs", "news", "twitter"),
  t(rbind(
    sapply(list(blogs, news, twitter), stri_stats_general),
    TotalWords = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]
  ))
)
print(stats)
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 blogs 899288 899288 206824382 170389539 37570839
## 2 news 77259 77259 15639408 13072698 2651432
## 3 twitter 2360148 2360148 162096031 134082634 30451128
Given how large the raw data is, it is a good idea to take a sample before starting the analysis.
set.seed(666)
sample_size <- 0.05
blogs_index <- sample(seq_len(length(blogs)), length(blogs)*sample_size)
news_index <- sample(seq_len(length(news)),length(news)*sample_size)
twitter_index <- sample(seq_len(length(twitter)), length(twitter)*sample_size)
blogs_sample <- blogs[blogs_index]
news_sample <- news[news_index]
twitter_sample <- twitter[twitter_index]
stats_samples <- data.frame(
  File = c("blogs_sample", "news_sample", "twitter_sample"),
  t(rbind(
    sapply(list(blogs_sample, news_sample, twitter_sample), stri_stats_general),
    TotalWords = sapply(list(blogs_sample, news_sample, twitter_sample), stri_stats_latex)[4, ]
  ))
)
print(stats_samples)
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 blogs_sample 44964 44964 10349414 8524676 1880991
## 2 news_sample 3862 3862 782265 653851 132679
## 3 twitter_sample 118007 118007 8097873 6698120 1521118
As expected, each sample contains roughly 5% of the data in the corresponding original file, which keeps the exploratory analysis manageable; a quick check is sketched below.
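As a quick sanity check (a sketch, not output included in this report), the fraction of words retained by the sampling can be computed directly from the two summary tables; it should be close to sample_size = 0.05:
# Fraction of the total words kept in each sample (blogs, news, twitter)
round(stats_samples$TotalWords / stats$TotalWords, 3)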
blogs.corpus <- corpus(blogs_sample)
news.corpus <- corpus(news_sample)
twitter.corpus <- corpus(twitter_sample)
docvars(blogs.corpus,"type") <- "blog"
docvars(news.corpus, "type") <- "news"
docvars(twitter.corpus, "type") <- "twitter"
corpus <- blogs.corpus + news.corpus + twitter.corpus
feature.matrix <- dfm(corpus, remove_punct = TRUE, remove_numbers= TRUE, remove_symbols= TRUE, stem = TRUE)
textplot_wordcloud(feature.matrix, max.words=30, colors=brewer.pal(6,"Dark2"), scale = c(8, 0.8))
top.features <- topfeatures(feature.matrix, n = 30)
top.features.df <- data.frame(top.features)
top.features.df["unigram"] <- rownames(top.features.df)
ggplot(top.features.df, aes(x = reorder(unigram, -top.features), y = top.features)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Feature") + ylab("Count")
bigram.frequency.threshold <- 10
bigram.top.feature.count <- 30
dfm.bigram <- dfm(corpus, remove_punct = TRUE, stem = TRUE, ngrams = 2)
bigram.frequency <- colSums(dfm.bigram)
bigram.frequency <- sort(bigram.frequency, decreasing = TRUE)
# Keep only bigrams that occur more often than the threshold (frequencies are already sorted)
bigram.frequency.pruned <- bigram.frequency[bigram.frequency > bigram.frequency.threshold]
bigram.df <- data.frame(bigram.frequency.pruned[seq(bigram.top.feature.count)])
names(bigram.df) <- "bigram.count"
bigram.df["bigram"] <- rownames(bigram.df)
ggplot(bigram.df, aes(x = reorder(bigram, -bigram.count), y = bigram.count)) +
geom_bar(position = "identity", stat = "identity", fill = "darkblue") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Bigram") +
ylab("Count")
trigram.frequency.threshold <- 10
trigram.top.feature.count <- 30
dfm.trigram <- dfm(corpus, remove_punct= TRUE, stem = TRUE, ngrams = 3)
trigram.frequency <- colSums(dfm.trigram)
trigram.frequency <- sort(trigram.frequency, decreasing = TRUE)
# Keep only trigrams that occur more often than the threshold (frequencies are already sorted)
trigram.frequency.pruned <- trigram.frequency[trigram.frequency > trigram.frequency.threshold]
trigram.df <- data.frame(trigram.frequency.pruned[seq(trigram.top.feature.count)])
names(trigram.df) <- "trigram.count"
trigram.df["trigram"] <- rownames(trigram.df)
ggplot(trigram.df, aes(x = reorder(trigram, -trigram.count), y = trigram.count)) +
geom_bar(position = "identity", stat = "identity", fill = "darkgray") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Trigram") +
ylab("Count")
I have been testing the quanteda library to develop a prediction model. However, I observed that this approach requires a lot of resources, both in processing time and in memory; for example, the dfm.trigram object alone takes about 206.3 Mb.
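A quick way to confirm these memory figures during development (a sketch, not output included in this report) is to inspect each object with object.size():
# In-memory size of each document-feature matrix (exact values depend on the sample)
format(object.size(feature.matrix), units = "Mb")
format(object.size(dfm.bigram), units = "Mb")
format(object.size(dfm.trigram), units = "Mb")  # about 206.3 Mb in this run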
I must find the model with the best trade-off between accuracy and efficiency (prediction time and RAM usage). For a smartphone keyboard application, it is pointless to gain 5% accuracy at the cost of a 5-second prediction time.
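One candidate I am considering is a simple frequency-based back-off model built directly on the pruned n-gram tables above: look up the last two words in the trigram counts, back off to the bigram counts, and finally to the most frequent unigrams. The sketch below is illustrative only; the helper predict_next_word() is not part of the final app, and it assumes that n-gram features are tokens joined by "_" (as dfm(..., ngrams = n) produces in this quanteda version) and that the input has been lower-cased and stemmed consistently with the corpus.
# Illustrative back-off predictor (not the final model); assumes "_"-joined n-gram names
predict_next_word <- function(phrase, n.suggestions = 3) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  last.two <- paste(tail(words, 2), collapse = "_")
  last.one <- tail(words, 1)
  # 1. Trigrams whose first two tokens match the end of the phrase
  hits <- trigram.frequency[startsWith(names(trigram.frequency), paste0(last.two, "_"))]
  # 2. Back off to bigrams starting with the last token
  if (length(hits) == 0) {
    hits <- bigram.frequency[startsWith(names(bigram.frequency), paste0(last.one, "_"))]
  }
  # 3. Back off to the most frequent unigrams overall
  if (length(hits) == 0) {
    return(names(topfeatures(feature.matrix, n.suggestions)))
  }
  # Return the final token of the highest-frequency matches
  candidates <- head(names(sort(hits, decreasing = TRUE)), n.suggestions)
  vapply(strsplit(candidates, "_"), tail, character(1), n = 1)
}
predict_next_word("thanks for the")
Pruning rare n-grams, as done with the frequency thresholds above, is what should keep these lookup tables small enough for an acceptable response time in the Shiny app.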