The goal of the capstone project is to create a predictive text model using a large text corpus of documents as training data. Natural language processing techniques will be used to perform the analysis.
# setwd("D:/Coursera/Coursera-SwiftKey/final/en_US")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
library(stringi) # string statistics (character and word counts)
# Size of files in megabytes (files.size.MB)
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("en_US.news.txt")$size / 1024^2 # Megabytes
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of lines (num.lines)
len_blogs <- length(blogs) # 899,288 lines
len_news <- length(news) # 1,010,242 lines
len_twitter <- length(twitter) # 2,360,148 lines
# Number of characters
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
# Number of words (num.words)
nword_blogs <- sum(stri_count_words(blogs))
nword_news <- sum(stri_count_words(news))
nword_twitter <- sum(stri_count_words(twitter))
# create table
data.frame(file.name = c("blogs", "news", "twitter"),
           files.size.MB = c(size_blogs, size_news, size_twitter),
           num.lines = c(len_blogs, len_news, len_twitter),
           num.character = c(nchar_blogs, nchar_news, nchar_twitter),
           num.words = c(nword_blogs, nword_news, nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs 200.4242 899288 206824505 37546250
## 2 news 196.2775 1010242 203223159 34762395
## 3 twitter 159.3641 2360148 162096031 30093372
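For a cleaner table in the knitted report, the same summary could be stored in a data frame and passed to knitr::kable; this is an optional sketch (summary_stats is just a name chosen here, and the knitr package is assumed to be installed).
summary_stats <- data.frame(file.name = c("blogs", "news", "twitter"),
                            files.size.MB = c(size_blogs, size_news, size_twitter),
                            num.lines = c(len_blogs, len_news, len_twitter),
                            num.character = c(nchar_blogs, nchar_news, nchar_twitter),
                            num.words = c(nword_blogs, nword_news, nword_twitter))
knitr::kable(summary_stats, digits = 1)  # renders as a formatted table when knitted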
Next, all non-English characters are removed and a sample dataset is compiled from 1% of each of the three original datasets.
set.seed(12345)
blogs1 <- iconv(blogs, "latin1", "ASCII", sub = "")
news1 <- iconv(news, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitter, "latin1", "ASCII", sub = "")
# Sample only 1% of each file
sample_data <- c(sample(blogs1, length(blogs1) * 0.01),
                 sample(news1, length(news1) * 0.01),
                 sample(twitter1, length(twitter1) * 0.01))
Since the full datasets are too large to process efficiently, the sample() function is used to draw a 1% sample from each file.
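If even the initial readLines() calls strain memory, an alternative would be to keep each line with probability 1% while reading, for example with rbinom(). This is only a sketch of that idea, not what was done above:
# Hypothetical alternative: sample at read time instead of after loading everything
con <- file("en_US.blogs.txt", "r")
blogs_sample <- character(0)
repeat {
  chunk <- readLines(con, n = 10000, warn = FALSE, encoding = "UTF-8")
  if (length(chunk) == 0) break
  keep <- as.logical(rbinom(length(chunk), 1, 0.01))  # keep roughly 1% of lines
  blogs_sample <- c(blogs_sample, chunk[keep])
}
close(con)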
library(tm) # Text mining
## Loading required package: NLP
library(NLP)
corpus <- VCorpus(VectorSource(sample_data))
corpus1 <- tm_map(corpus, removePunctuation)   # remove punctuation
corpus2 <- tm_map(corpus1, stripWhitespace)    # collapse extra whitespace
corpus3 <- tm_map(corpus2, tolower)            # convert to lowercase
corpus4 <- tm_map(corpus3, removeNumbers)      # remove digits
corpus5 <- tm_map(corpus4, PlainTextDocument)  # restore plain-text document class
# Remove English stop words (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5, removeWords, stopwords("english"))
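To spot-check the effect of these cleaning steps, the first couple of cleaned documents can be printed. This is only a quick sanity check, not part of the analysis itself:
# Print two cleaned documents to verify the transformations
writeLines(as.character(corpus6[[1]]))
writeLines(as.character(corpus6[[2]]))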
In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words, bigrams are two-word combinations, and trigrams are three-word combinations.
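As a quick illustration (a toy example, separate from the analysis), the bigrams of a short sentence can be built in base R by pasting each word to its successor:
words <- strsplit("thanks for the follow back", " ")[[1]]
paste(head(words, -1), tail(words, -1))  # "thanks for" "for the" "the follow" "follow back"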
The following code tokenizes the corpus and extracts 1-grams, 2-grams, and 3-grams using quanteda.
library(quanteda)
## Package version: 2.0.1
## Parallel computing: 2 of 32 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
## The following object is masked from 'package:utils':
##
## View
# Create a corpus from the sample data
corpus <- corpus(sample_data)
# Tokenize and create n-grams
tokens <- tokens(corpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
tokens <- tokens_remove(tokens, stopwords("english"))
tokens <- tokens_tolower(tokens)
# Create n-gram functions
create_ngrams <- function(x, n) {
  tokens_ngrams(x, n = n, concatenator = " ")
}
# Generate n-grams
unigrams <- create_ngrams(tokens, 1)
bigrams <- create_ngrams(tokens, 2)
trigrams <- create_ngrams(tokens, 3)
# Create frequency tables
unigram_freq <- dfm(unigrams) %>% topfeatures(n = 1000)
bigram_freq <- dfm(bigrams) %>% topfeatures(n = 80)
trigram_freq <- dfm(trigrams) %>% topfeatures(n = 10)
# Convert to data frames
one_corpus_sort <- data.frame(Word = names(unigram_freq), frequency = unigram_freq)
two_corpus_sort <- data.frame(Word = names(bigram_freq), frequency = bigram_freq)
thr_corpus_sort <- data.frame(Word = names(trigram_freq), frequency = trigram_freq)
# Display top results
head(one_corpus_sort)
## Word frequency
## just just 3099
## said said 3051
## one one 2816
## like like 2683
## can can 2463
## get get 2353
head(two_corpus_sort)
## Word frequency
## right now right now 247
## last year last year 192
## new york new york 179
## last night last night 160
## feel like feel like 145
## high school high school 143
head(thr_corpus_sort)
## Word frequency
## call call call call call call 23
## new york city new york city 21
## happy new year happy new year 21
## let us know let us know 20
## happy mothers day happy mothers day 19
## italy lakes holidays italy lakes holidays 18
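These n-gram frequency tables are the raw material for the prediction algorithm. A minimal sketch of persisting them for reuse in the planned Shiny app follows; the .rds file names are assumptions:
# Save the frequency tables so the app does not need to rebuild the corpus
saveRDS(one_corpus_sort, "unigram_freq.rds")
saveRDS(two_corpus_sort, "bigram_freq.rds")
saveRDS(thr_corpus_sort, "trigram_freq.rds")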
The frequency distribution of each n-gram category is visualized in three bar plots.
library(ggplot2) #visualization
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
one_g <- ggplot(one_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency, fill = frequency))
one_g <- one_g + geom_bar(stat = "identity")
one_g <- one_g + labs(title = "Unigrams", x = "Words", y = "Frequency")
one_g <- one_g + theme(axis.text.x = element_text(angle = 90))
one_g

two_g <- ggplot(two_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency, fill = frequency))
two_g <- two_g + geom_bar(stat = "identity")
two_g <- two_g + labs(title = "Bigrams", x = "Words", y = "Frequency")
two_g <- two_g + theme(axis.text.x = element_text(angle = 90))
two_g

thr_g <- ggplot(thr_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency, fill = frequency))
thr_g <- thr_g + geom_bar(stat = "identity")
thr_g <- thr_g + labs(title = "Trigrams", x = "Words", y = "Frequency")
thr_g <- thr_g + theme(axis.text.x = element_text(angle = 90))
thr_g
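If the three plots need to appear side by side in the final report, the gridExtra package could be used; this is optional and assumes gridExtra is installed:
library(gridExtra)
grid.arrange(one_g, two_g, thr_g, ncol = 3)  # unigram, bigram, trigram plots in one row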
Ideas: Use a text input box as the user interface of the Shiny app.
Next Steps:
1. Build a predictive algorithm.
2. Build a Shiny app that suggests the most likely next word after a phrase is typed.
3. Prepare a pitch about the app and publish it on the shinyapps.io server.
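A minimal sketch of the text-input idea above, assuming a placeholder prediction function (predict_next_word does not exist yet and stands in for the algorithm to be built in step 1):
library(shiny)

# Placeholder until the predictive algorithm from step 1 is implemented
predict_next_word <- function(phrase) "(prediction goes here)"

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Type a phrase:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText(predict_next_word(input$phrase))
}

shinyApp(ui = ui, server = server)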