This document contains basic summary statistics of the SwiftKey data set and sets the foundation for the predictive model algorithm and the app development later on.
library(tm)         # VCorpus, tm_map, TermDocumentMatrix
library(ggplot2)    # qplot / ggplot
library(stringi)    # stri_count_words
library(gridExtra)  # grid.arrange
library(RWeka)      # NGramTokenizer, Weka_control
## Loading required package: NLP
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
##     annotate
directory <- 'D:/02 Coursera/02 R/01 Johns Hopkings-Coursera/10 Capstone'
link <- 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
filename <- 'Coursera-SwiftKey.zip'
##'Create the folder ('10 Capstone') if it does not exist yet.
if(!file.exists(directory)){
dir.create(path = directory)
}
##'Check if the file has already been downloaded; if not, download it.
if(!file.exists(paste(directory,'/',filename,sep = ''))){
download.file(url = link, destfile = paste(directory,'/',filename,sep = ''), mode = 'wb')
}
##'Check if the file has already been unzipped in the folder; if not, unzip it.
if(!file.exists(paste(directory,'/','final',sep = ''))){ #' 'final' is the name of the folder that contains the txt files
unzip(zipfile = paste(directory,'/',filename,sep = ''), exdir = directory) #' unzip the file in the directory
}
In this analysis we focus only on the 'en_US' folder, so the basic statistics cover only those files. We present a table with the size of each file, the number of lines, the number of words, and the mean number of words per line.
files <- list.files(path = paste(directory,'/','final','/','en_US',sep = ''), pattern = '.txt', full.names = TRUE)
size_blog <- file.info(files[[1]])$size / (1024^2)
size_news <- file.info(files[[2]])$size / (1024^2)
size_twitter <- file.info(files[[3]])$size / (1024^2)
lines_blog <- readLines(files[[1]])
lines_news <- readLines(files[[2]])
## Warning in readLines(files[[2]]): incomplete final line found on 'D:/02
## Coursera/02 R/01 Johns Hopkings-Coursera/10 Capstone/final/en_US/en_US.news.txt'
lines_twitter <- readLines(files[[3]])
## Warning in readLines(files[[3]]): line 167155 appears to contain an embedded nul
## Warning in readLines(files[[3]]): line 268547 appears to contain an embedded nul
## Warning in readLines(files[[3]]): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(files[[3]]): line 1759032 appears to contain an embedded
## nul
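These warnings are harmless for this exploration, but they can be avoided. As an alternative (not used in the run above, shown only as a suggestion), the news file can be read through a binary connection to avoid the 'incomplete final line' warning, and skipNul = TRUE drops the embedded nul characters in the Twitter file:
con <- file(files[[2]], open = 'rb')  ## binary connection avoids the incomplete-final-line warning
lines_news <- readLines(con, encoding = 'UTF-8')
close(con)
lines_twitter <- readLines(files[[3]], encoding = 'UTF-8', skipNul = TRUE)  ## drop embedded nul characters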
words_blog <- stri_count_words(lines_blog)
words_news <- stri_count_words(lines_news)
words_twitter <- stri_count_words(lines_twitter)
sum_files <- data.frame(INF_SOURCE = c('BLOGS','NEWS','TWITTER'),FILE_SIZE_MB = c(size_blog,size_news,size_twitter),
NUM_LINES = c(length(lines_blog),length(lines_news),length(lines_twitter)),
NUM_WORDS = c(sum(words_blog),sum(words_news),sum(words_twitter)),
MEAN_NUM_WORDS = c(mean(words_blog),mean(words_news),mean(words_twitter)))
print(sum_files)
## INF_SOURCE FILE_SIZE_MB NUM_LINES NUM_WORDS MEAN_NUM_WORDS
## 1 BLOGS 200.4242 899288 38154238 42.42716
## 2 NEWS 196.2775 77259 2693898 34.86840
## 3 TWITTER 159.3641 2360148 30218125 12.80349
Now we present a histogram for each file. Based on these, we can see that Twitter has far fewer words per line, as expected given its character limit.
p1 <- qplot(words_blog, geom = 'histogram', main = 'US BLOGS', xlab = 'WORDS PER LINE', ylab = 'Frequency', binwidth = 5)
p2 <- qplot(words_news, geom = 'histogram', main = 'US NEWS', xlab = 'WORDS PER LINE', ylab = 'Frequency', binwidth = 5)
p3 <- qplot(words_twitter, geom = 'histogram', main = 'US TWITTER', xlab = 'WORDS PER LINE', ylab = 'Frequency', binwidth = 1)
plotlist <- list(p1,p2,p3) #attaching plots to a list
rm(p1,p2,p3) #removing from memory
do.call(what = grid.arrange, c(plotlist,list(ncol = 1)))
Since the data set is very large, we take a random sample of the data, using 0.4% (0.004) of the lines from each file.
set.seed(2020)
data_sample <- c(sample(lines_blog, length(lines_blog)*0.004),sample(lines_news, length(lines_news)*0.004), # 0.004 was chosen because of memory capacity
sample(lines_twitter, length(lines_twitter)*0.004))
corpus <- VCorpus(VectorSource(data_sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, ' ', x)) ## replace a matched pattern with a space
corpus <- tm_map(corpus, toSpace, '(f|ht)tp(s?)://(.*)[.][a-z]+') ## remove URLs
corpus <- tm_map(corpus, toSpace, '@[^\\s]+') ## remove Twitter handles
corpus <- tm_map(corpus, tolower) ## lower case (documents are rebuilt as PlainTextDocument below)
corpus <- tm_map(corpus, removeWords, stopwords('en')) ## remove English stop words
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument) ## restore PlainTextDocument class for TermDocumentMatrix
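As a quick sanity check (an optional addition, not part of the original output), the first cleaned document can be inspected to confirm the transformations behaved as intended:
writeLines(as.character(corpus[[1]]))  ## show the text of the first cleaned document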
In this section we want to know which single words, pairs of words, and triples of words (uni-, bi-, and tri-grams) are most likely to appear. We show the 30 most frequent of each.
get_freq <- function(tdm){
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
return(data.frame(word = names(freq), freq = freq))
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
createplot <- function(df, label){
ggplot(df[1:30,], aes(reorder(word, -freq), freq)) + # top 30, bars ordered by decreasing frequency
labs(x = label, y ='FREQUENCY') +
theme(axis.text.x = element_text(angle = 50, size = 12, hjust = 1)) +
geom_bar(stat = 'identity', fill = I('blue'))
}
freq1 <- get_freq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- get_freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- get_freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
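For a quick look at the tables themselves (a small illustrative addition; output not shown), the top entries of each frequency table can be printed directly:
head(freq1, 10)  ## most frequent single words
head(freq2, 10)  ## most frequent word pairs
head(freq3, 10)  ## most frequent word triples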
Histogram of the 30 most common unigrams in the sample.
createplot(freq1, '30 MOST COMMON UNI-GRAM')
Histogram of the 30 most common bigrams in the sample.
createplot(freq2, '30 MOST COMMON BI-GRAM')
Histogram of the 30 most common trigrams in the sample.
createplot(freq3, '30 MOST COMMON TRI-GRAM')
With this we have a better understanding of which words are normally used together.
The next step is to use these n-gram frequencies to build the predictive model and then deploy the model in the app.
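As a rough illustration of that idea, the frequency tables built above could already drive a simple backoff-style lookup: match the last two typed words against the trigram table, fall back to the bigram table, and finally to the overall most frequent words. The sketch below (predict_next is a hypothetical name; this is only one possible design, not the final algorithm) shows the idea:
## Hypothetical sketch: backoff lookup over the n-gram frequency tables (not the final model).
predict_next <- function(phrase, freq3, freq2, freq1, n = 3){
  words <- tail(unlist(strsplit(tolower(phrase), '\\s+')), 2)
  if(length(words) == 2){
    ## 1) trigrams that start with the last two words
    hits <- freq3[grepl(paste0('^', words[1], ' ', words[2], ' '), freq3$word), ]
    if(nrow(hits) > 0) return(head(sapply(strsplit(as.character(hits$word), ' '), `[`, 3), n))
  }
  ## 2) back off to bigrams that start with the last word
  hits <- freq2[grepl(paste0('^', tail(words, 1), ' '), freq2$word), ]
  if(nrow(hits) > 0) return(head(sapply(strsplit(as.character(hits$word), ' '), `[`, 2), n))
  ## 3) final fallback: the overall most frequent words
  head(as.character(freq1$word), n)
}
predict_next('happy mothers', freq3, freq2, freq1)  ## e.g. suggests the third word of matching trigrams
Because the tables are already sorted by decreasing frequency, the first matches returned are the most likely continuations; a real model would also need smoothing and handling of unseen n-grams.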