In this capstone we apply data science techniques in the area of natural language processing.
The training data that will be the basis for the capstone is located here: Capstone Dataset
This report will only look at the English language datasets.
This report contains the following:
- An exploratory analysis of the training data set
- Basic summaries of the three files: word counts, line counts, and basic data tables
- Basic plots, such as histograms, to illustrate features of the data
#Set working directory
setwd("C:/Users/lisa.mccormick/Documents/Coursera materials/Data Science Capstone")
# Loading Libraries
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(tm)
## Loading required package: NLP
library(stringi)
library(RWeka)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(SnowballC)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
# Setting up doParallel
set.seed(123)
n_cores <- detectCores() - 2
registerDoParallel(n_cores,cores=n_cores)
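Once the analysis is finished, the registered workers can be released again; a minimal sketch, assuming the implicit cluster created by registerDoParallel() above:
# Release the parallel workers when the analysis is done (sketch)
stopImplicitCluster()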
# Show files used
directory_us <- file.path(".", "Coursera-SwiftKey", "final", "en_US/")
dir(directory_us)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
These are very large files, so rather than loading them directly, we open a connection to each file, read its lines, and close the connection again.
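A connection also lets us preview just the first few lines of a file without pulling the whole file into memory; a minimal sketch using the n argument of readLines():
# Preview the first three lines of the blogs file (sketch)
preview_con <- file(paste0(directory_us, "/en_US.blogs.txt"), "r")
readLines(preview_con, n = 3, encoding = "UTF-8", skipNul = TRUE)
close(preview_con)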
# Load files and show summaries
blogs_con <- file(paste0(directory_us, "/en_US.blogs.txt"), "r")
blogs <- readLines(blogs_con, encoding="UTF-8", skipNul = TRUE)
close(blogs_con)
news_con <- file(paste0(directory_us, "/en_US.news.txt"), "r")
news <- readLines(news_con, encoding="UTF-8", skipNul = TRUE)
## Warning in readLines(news_con, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on './Coursera-SwiftKey/final/en_US/en_US.news.txt'
close(news_con)
twitter_con <- file(paste0(directory_us, "/en_US.twitter.txt"), "r")
twitter <- readLines(twitter_con, encoding="UTF-8", skipNul = TRUE)
close(twitter_con)
# Count lines, words and characters
WPL <- sapply(list(blogs, news, twitter), function(x)
  summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(WPL) <- c('WPL_Min', 'WPL_Mean', 'WPL_Max')
rawstats <- data.frame(
  File = c("blogs", "news", "twitter"),
  t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
          TotalWords = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ],
          WPL))
)
# Show in a table
kable(rawstats) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| File | Lines | LinesNEmpty | Chars | CharsNWhite | TotalWords | WPL_Min | WPL_Mean | WPL_Max |
|---|---|---|---|---|---|---|---|---|
| blogs | 899288 | 899288 | 206824382 | 170389539 | 37570839 | 0 | 41.75170 | 6726 |
| news | 77259 | 77259 | 15639408 | 13072698 | 2651432 | 1 | 34.61812 | 1126 |
| twitter | 2360148 | 2360148 | 162096241 | 134082806 | 30451170 | 1 | 12.75203 | 47 |
The blogs, news, and twitter files differ considerably in their numbers of lines, words, and characters. As expected for a platform with a short character limit, Twitter has by far the lowest mean number of words per line.
Because the files are quite large, we subset the data by sampling 1% of each file at random and save the sample to an RDS file that we can reload without having to process the original datasets again.
# Sample of dataset
set.seed(456)
data.sample <- c(sample(blogs, length(blogs) * 0.01),
sample(news, length(news) * 0.01),
sample(twitter, length(twitter) * 0.01))
saveRDS(data.sample, 'sample.rds')
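# (Sketch) Sanity check before removing the raw objects: the sample should
# contain roughly 1% of the combined line count of the three files
length(data.sample)
round(0.01 * (length(blogs) + length(news) + length(twitter)))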
# Remove the files we do not need anymore
rm(blogs, blogs_con, data.sample, directory_us, news, news_con, rawstats, twitter,
twitter_con, WPL)
The cleaning process converts the text to lower case, removes punctuation, numbers, and English stopwords, stems the remaining words, strips excess whitespace, and converts the corpus to plain text documents. Non-ASCII characters can also be removed at this stage (see the sketch after the chunk below).
# Load the RDS file
data <- readRDS("sample.rds")
# Create a Corpus
docs <- VCorpus(VectorSource(data))
# Remove data we do not need
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
# Do stemming
docs <- tm_map(docs, stemDocument)
# Strip whitespaces
docs <- tm_map(docs, stripWhitespace)
# Create plain text format
docs <- tm_map(docs, PlainTextDocument)
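The cleaning description above also mentions removing non-ASCII characters, which the chunk does not show. A minimal sketch, assuming base R's iconv() applied to the raw sample before the VCorpus() call:
# Drop characters that cannot be represented in ASCII (sketch); this would run
# on 'data' before building the corpus above
data <- iconv(data, from = "UTF-8", to = "ASCII", sub = "")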
From Wikipedia, the free encyclopedia:
“An n-gram is a sequence of n adjacent symbols in particular order. The symbols may be n adjacent letters (including punctuation marks and blanks), syllables, or rarely whole words found in a language dataset; or adjacent phonemes extracted from a speech-recording dataset, or adjacent base pairs extracted from a genome. They are collected from a text corpus or speech corpus.”
We need to use tokenization to make the unigrams (1 word string), bigrams (2 word string), and trigrams (3 word string).
# Create tokenization functions
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
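As a quick illustration of what these functions return (assuming RWeka's default word-delimiter behaviour), applying the bigram tokenizer to a toy string yields the overlapping two-word sequences:
# Toy example of the bigram tokenizer (sketch)
bigram("thank you for the follow")
## Illustrative output: "thank you" "you for" "for the" "the follow"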
Now we can look at frequency tables and histograms of the n-grams in order to determine which words and word pairs are most frequent in our subset.
# Create TermDocumentMatrix with Tokenizations and Remove Sparse Terms
tdm_freq1 <- removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = unigram)), 0.9999)
tdm_freq2 <- removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = bigram)), 0.9999)
tdm_freq3 <- removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = trigram)), 0.9999)
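# (Sketch) The 0.9999 sparsity threshold keeps only terms appearing in at least
# roughly 0.01% of documents; check how many terms survive in each matrix
sapply(list(tdm_freq1, tdm_freq2, tdm_freq3), function(x) dim(x)[1])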
# Create frequencies
uni_freq <- sort(rowSums(as.matrix(tdm_freq1)), decreasing=TRUE)
bi_freq <- sort(rowSums(as.matrix(tdm_freq2)), decreasing=TRUE)
tri_freq <- sort(rowSums(as.matrix(tdm_freq3)), decreasing=TRUE)
# Create DataFrames
uni_df <- data.frame(term=names(uni_freq), freq=uni_freq)
bi_df <- data.frame(term=names(bi_freq), freq=bi_freq)
tri_df <- data.frame(term=names(tri_freq), freq=tri_freq)
# Show head 10 of unigrams
head(uni_df, 10)
## term freq
## just just 2538
## like like 2447
## get get 2441
## will will 2237
## one one 2198
## can can 1941
## time time 1934
## love love 1805
## day day 1687
## make make 1623
# Plot head 15 of unigrams
head(uni_df,15) %>%
ggplot(aes(reorder(term,-freq), freq)) +
geom_bar(stat = "identity") +
ggtitle("20 Most Frequent Unigrams") +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5))
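The wordcloud package was loaded above but is not used in the chunks shown; a word cloud of the same unigram frequencies could be drawn as follows (a sketch):
# Word cloud of the most frequent unigrams (sketch)
wordcloud(words = as.character(uni_df$term), freq = uni_df$freq, max.words = 50,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))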
# Show head 10 of bigrams
head(bi_df,10)
## term freq
## right now right now 207
## cant wait cant wait 174
## feel like feel like 154
## last night last night 146
## look like look like 145
## look forward look forward 134
## thank follow thank follow 130
## dont know dont know 126
## im go im go 123
## can get can get 94
# Plot head 15 of bigrams
head(bi_df,15) %>%
ggplot(aes(reorder(term,-freq), freq)) +
geom_bar(stat = "identity") +
ggtitle("20 Most Frequent Bigrams") +
xlab("Bigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5))
# Show head 10 of trigrams
head(tri_df,10)
## term freq
## cant wait see cant wait see 35
## happi mother day happi mother day 27
## happi new year happi new year 23
## let us know let us know 21
## look forward see look forward see 20
## im pretti sure im pretti sure 17
## just let know just let know 12
## new york time new york time 12
## dont even know dont even know 11
## want make sure want make sure 11
# Plot head 15 of trigrams
head(tri_df,15) %>%
ggplot(aes(reorder(term,-freq), freq)) +
geom_bar(stat = "identity") +
ggtitle("20 Most Frequent Trigrams") +
xlab("Trigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5))
Next steps: build a prediction algorithm based on these n-gram tokenizations and create a Shiny app that uses the algorithm.
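A minimal sketch of how such a lookup could work, assuming the bi_df and tri_df frequency tables above (terms stored as space-separated, frequency-sorted n-grams); predict_next_word is a hypothetical helper, not part of the report's code:
# Hypothetical next-word lookup with simple trigram-to-bigram backoff (sketch)
predict_next_word <- function(phrase, bi_df, tri_df) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    # Try the trigram table first: match on the last two words of the phrase
    prefix <- paste(words[n - 1], words[n])
    hits <- as.character(tri_df$term)[startsWith(as.character(tri_df$term),
                                                 paste0(prefix, " "))]
    if (length(hits) > 0) return(substring(hits[1], nchar(prefix) + 2))
  }
  # Back off to the bigram table: match on the last word only
  prefix <- words[n]
  hits <- as.character(bi_df$term)[startsWith(as.character(bi_df$term),
                                              paste0(prefix, " "))]
  if (length(hits) > 0) return(substring(hits[1], nchar(prefix) + 2))
  NA_character_
}
# Example call (the frames are sorted by frequency, so the first hit is the
# most frequent continuation): predict_next_word("cant wait", bi_df, tri_df)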