Introduction

This document contains basic summary statistics of the corpus and lays the foundation for the predictive model and the app that will be developed later on.

Packages used
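
The library() calls are not echoed in this report. Based on the functions used below (and the attach messages that follow), the analysis assumes the following packages are loaded:

library(tm)        # corpus handling and text cleaning (loads NLP)
library(ggplot2)   # histograms and frequency plots
library(stringi)   # stri_count_words for word counts
library(RWeka)     # NGramTokenizer for bi- and tri-grams
library(gridExtra) # grid.arrange to combine the plots into one figure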

## Loading required package: NLP
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Downloading the data

directory <- 'D:/02 Coursera/02 R/01 Johns Hopkings-Coursera/10 Capstone'
link <- 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
filename <- 'Coursera-SwiftKey.zip'

# Create the folder ('10 Capstone') if it does not already exist.
if(!file.exists(directory)){
  dir.create(path = directory)
}

# Download the file if it has not been downloaded yet.
if(!file.exists(paste(directory,'/',filename,sep = ''))){
  download.file(url = link, destfile = paste(directory,'/',filename,sep = ''), mode = 'wb')
}

# Unzip the file into the directory if it has not been unzipped yet.
if(!file.exists(paste(directory,'/','final',sep = ''))){ # 'final' is the folder that contains the txt files
  unzip(zipfile = paste(directory,'/',filename,sep = ''), exdir = directory)
}

Basic Analysis

In this case we focus only on the ‘en_US’ folder, so the basic statistics cover only those files. We present a table with the size of each file, the number of lines, the number of words, and the mean number of words per line.

files <- list.files(path = paste(directory,'/','final','/','en_US',sep = ''), pattern = '.txt', full.names = TRUE)

# File sizes in MB
size_blog <- file.info(files[[1]])$size / (1024^2)
size_news <- file.info(files[[2]])$size / (1024^2)
size_twitter <- file.info(files[[3]])$size / (1024^2)

lines_blog <- readLines(files[[1]])
lines_news <- readLines(files[[2]])
## Warning in readLines(files[[2]]): incomplete final line found on 'D:/02
## Coursera/02 R/01 Johns Hopkings-Coursera/10 Capstone/final/en_US/en_US.news.txt'
lines_twitter <- readLines(files[[3]])
## Warning in readLines(files[[3]]): line 167155 appears to contain an embedded nul
## Warning in readLines(files[[3]]): line 268547 appears to contain an embedded nul
## Warning in readLines(files[[3]]): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(files[[3]]): line 1759032 appears to contain an embedded
## nul
words_blog <- stri_count_words(lines_blog)
words_news <- stri_count_words(lines_news)
words_twitter <- stri_count_words(lines_twitter)

sum_files <- data.frame(INF_SOURCE = c('BLOGS','NEWS','TWITTER'),FILE_SIZE_MB = c(size_blog,size_news,size_twitter),
                       NUM_LINES = c(length(lines_blog),length(lines_news),length(lines_twitter)),
                       NUM_WORDS = c(sum(words_blog),sum(words_news),sum(words_twitter)),
                       MEAN_NUM_WORDS = c(mean(words_blog),mean(words_news),mean(words_twitter)))

print(sum_files)
##   INF_SOURCE FILE_SIZE_MB NUM_LINES NUM_WORDS MEAN_NUM_WORDS
## 1      BLOGS     200.4242    899288  38154238       42.42716
## 2       NEWS     196.2775     77259   2693898       34.86840
## 3    TWITTER     159.3641   2360148  30218125       12.80349

Now we present a histogram of words per line for each file. Based on these, we can see that Twitter has far fewer words per line, because of its character limit.

p1 <- qplot(words_blog, geom = 'histogram', main = 'US BLOGS', xlab = 'WORDS PER LINE', ylab = 'Frequency', binwidth = 5)

p2 <- qplot(words_news, geom = 'histogram', main = 'US NEWS', xlab = 'WORDS PER LINE', ylab = 'Frequency', binwidth = 5)

p3 <- qplot(words_twitter, geom = 'histogram', main = 'US TWITTER', xlab = 'WORDS PER LINE', ylab = 'Frequency', binwidth = 1)

plotlist <- list(p1,p2,p3) # collect the plots in a list
rm(p1,p2,p3) # remove the individual plot objects from memory

do.call(what = grid.arrange, c(plotlist, list(ncol = 1)))

Sampling from the raw data and cleaning

Since the data is very large, we take a random sample from each file, using a sampling fraction of 0.004 (0.4% of the lines) per file.

set.seed(2020)
data_sample <- c(sample(lines_blog, length(lines_blog)*0.004),sample(lines_news, length(lines_news)*0.004), # 0.004 was chosen because of memory capacity
                 sample(lines_twitter, length(lines_twitter)*0.004))

corpus <- VCorpus(VectorSource(data_sample))

toSpace <- content_transformer(function(x, pattern) gsub(pattern, ' ', x)) # helper: replace pattern matches with a space

corpus <- tm_map(corpus, toSpace, '(f|ht)tp(s?)://(.*)[.][a-z]+') # remove URLs

corpus <- tm_map(corpus, toSpace, '@[^\\s]+') # remove Twitter handles

corpus <- tm_map(corpus, tolower) # convert to lower case

corpus <- tm_map(corpus, removeWords, stopwords('en')) # remove English stop words

corpus <- tm_map(corpus, removePunctuation) # remove punctuation

corpus <- tm_map(corpus, removeNumbers) # remove numbers

corpus <- tm_map(corpus, stripWhitespace) # collapse extra whitespace

corpus <- tm_map(corpus, PlainTextDocument) # convert back to PlainTextDocument objects for TermDocumentMatrix

Analysis of the n-grams

In this section we look at which single words, word pairs, and word triples (uni-, bi-, and tri-grams) are most likely to appear. We show the 30 most frequent of each.

get_freq <- function(tdm){ # term frequencies from a term-document matrix, sorted in decreasing order
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) # two-word tokens
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) # three-word tokens

createplot <- function(df, label){
  ggplot(df[1:30,], aes(reorder(word, -freq), freq)) + # keep the 30 most frequent terms, bars ordered by frequency
    labs(x = label, y = 'FREQUENCY') +
    theme(axis.text.x = element_text(angle = 50, size = 12, hjust = 1)) +
    geom_bar(stat = 'identity', fill = I('blue'))
}

freq1 <- get_freq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)) # uni-gram frequencies
freq2 <- get_freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999)) # bi-gram frequencies
freq3 <- get_freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999)) # tri-gram frequencies

Histogram of the 30 most common unigrams in the sample.

createplot(freq1, '30 MOST COMMON UNI-GRAM')

Histogram of the 30 most common bi-grams in the sample.

createplot(freq2, '30 MOST COMMON BI-GRAM')

Histogram of the 30 most common tri-grams in the sample.

createplot(freq3, '30 MOST COMMON TRI-GRAM')

With this we have a better understanding of which words are normally used together.

Plans for building the prediction algorithm

The n-gram frequency tables will be used as the basis of the predictive model: the most frequent n-grams will be used to suggest the next word from what the user has typed, and the model will then be deployed in the app.
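
As an illustration only (not something implemented in this report), the sketch below shows one possible next-word lookup with a simple back-off, assuming the freq2 and freq3 data frames built above; the function name predict_next and its interface are hypothetical.

# Hypothetical sketch, not part of the original analysis.
# Assumes freq2/freq3 as built above: data frames sorted by decreasing 'freq',
# with 'word' holding the bi-/tri-gram as a single space-separated string.
predict_next <- function(input, freq2, freq3, n = 3){
  tokens <- tolower(unlist(strsplit(trimws(input), '\\s+')))
  if(length(tokens) >= 2){ # try tri-grams first: match the last two words of the input
    prefix <- paste(tail(tokens, 2), collapse = ' ')
    hits <- freq3[grepl(paste0('^', prefix, ' '), freq3$word), ]
    if(nrow(hits) > 0){ # tables are already sorted, so the first rows are the most frequent
      return(head(sapply(strsplit(as.character(hits$word), ' '), tail, 1), n))
    }
  }
  prefix <- tail(tokens, 1) # back off to bi-grams: match the last word only
  hits <- freq2[grepl(paste0('^', prefix, ' '), freq2$word), ]
  head(sapply(strsplit(as.character(hits$word), ' '), tail, 1), n)
}

predict_next('happy new', freq2, freq3) # e.g. candidate words seen after 'happy new' in the sample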