This paper is a milestone report for the Capstone Project of the Johns Hopkins University Data Science Specialization on Coursera. The objective of the project is to create an algorithm that predicts the next word of a user's input (for example, in text messaging) and to implement it in an app. To this end, a dataset provided by SwiftKey, consisting of news articles, blog entries and Twitter messages, is analysed using Natural Language Processing (NLP). This report describes the retrieval and preprocessing of the data, presents the results of a basic exploratory analysis, and outlines plans for building the prediction algorithm.
The data can be downloaded from the following link. (Some code lines are commented out with "#", as they would otherwise take significant time when compiling this report.)
dest_file<-"Coursera-SwiftKey.zip"
url<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
#download.file(url, dest_file)
Next, the data is extracted from the .zip file.
#unzip(dest_file)
The data consists of entries from blogs, news and Twitter in four different languages (English, German, Finnish and Russian). In this paper only the English texts are considered.
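Assuming the archive was unzipped into the default final/ directory (as the file paths below suggest), the English files can be listed as a quick check:
list.files("final/en_US")
# expected: "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"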
#en_US.blogs<-readLines("final/en_US/en_US.blogs.txt")
#en_US.news<-readLines("final/en_US/en_US.news.txt")
#en_US.twitter<-readLines("final/en_US/en_US.twitter.txt")
library(stringi)
In the next code blocks, general statistics about the word counts and file sizes are summarized.
#Counting the words of each line
en_US.blogs.words <- stri_count_words(en_US.blogs)
en_US.news.words<- stri_count_words(en_US.news)
en_US.twitter.words <- stri_count_words(en_US.twitter)
en_US.blogs.size <- file.info("final/en_US/en_US.blogs.txt")$size/1024^2
en_US.news.size<- file.info("final/en_US/en_US.news.txt")$size/1024^2
en_US.twitter.size <- file.info("final/en_US/en_US.twitter.txt")$size/1024^2
data.frame(filename = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           size_in_MB = c(
             en_US.blogs.size,
             en_US.news.size,
             en_US.twitter.size
           ),
           nr_of_lines = c(
             length(en_US.blogs),
             length(en_US.news),
             length(en_US.twitter)
           ),
           nr_of_words = c(
             sum(en_US.blogs.words),
             sum(en_US.news.words),
             sum(en_US.twitter.words)
           ),
           mean_nr_of_words = c(
             mean(en_US.blogs.words),
             mean(en_US.news.words),
             mean(en_US.twitter.words)
           )
)
## filename size_in_MB nr_of_lines nr_of_words mean_nr_of_words
## 1 en_US.blogs 200.4242 899288 38154238 42.42716
## 2 en_US.news 196.2775 77259 2693898 34.86840
## 3 en_US.twitter 159.3641 2360148 30218125 12.80349
The following tables and histograms show the distribution of word count per line. (Some histograms are cut off on the x-axis due to outliers.)
summary(en_US.blogs.words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 29.00 42.43 61.00 6726.00
summary(en_US.news.words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.87 46.00 1123.00
summary(en_US.twitter.words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 7.0 12.0 12.8 18.0 60.0
par(mfrow=c(1,3))
hist(en_US.blogs.words[en_US.blogs.words<=300],col="#DDAAAA",main="Word count in \n en_US.blogs",xlab="Number of words")
hist(en_US.news.words[en_US.news.words<=150],col="#AADDAA",main="Word count in \n en_US.news",xlab="Number of words")
hist(en_US.twitter.words,col="#AAAADD",main="Word count in \n en_US.twitter",xlab="Number of words")
For demonstration purposes, only 1% of the data is used in this paper.
set.seed(1)
en_US.blogs.sample <- sample(en_US.blogs, length(en_US.blogs)*0.01)
en_US.news.sample <- sample(en_US.news, length(en_US.news)*0.01)
en_US.twitter.sample <- sample(en_US.twitter, length(en_US.twitter)*0.01)
# Convert the twitter sample to ASCII, dropping emoji and other non-ASCII characters
en_US.twitter.sample <- sapply(en_US.twitter.sample, function(row) iconv(row, "latin1", "ASCII", sub=""))
library(tm)
library(RWeka)
For each of the files a text corpus is generated, which can then be used to analyse the word combinations in the respective files. To clean the data beforehand, the text is transformed to lower case, and punctuation as well as numbers are removed.
newCorp <- function(data) {
  # Build a corpus containing the sample as a single document
  corp <- Corpus(VectorSource(list(data)))
  # Clean the text: lower case, remove punctuation and numbers
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, removeNumbers)
  corp
}
en_us.blogs.corp<-newCorp(en_US.blogs.sample)
en_us.news.corp<-newCorp(en_US.news.sample)
en_us.twitter.corp<-newCorp(en_US.twitter.sample)
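As a quick sanity check, the cleaning can be applied to a short made-up string (not taken from the corpus; the input below is purely illustrative):
toy <- newCorp("Hello, World! 123 times.")
as.character(toy[[1]])
# should yield roughly "hello world times" (lower case, no punctuation, no numbers)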
To calculate the n-gram frequencies, a function is declared. An n-gram is a sequence of n consecutive words; a small example follows the function definition.
getNgrams <- function(corp, n) {
  # Tokenizer that splits the text into n-grams of exactly n words
  tok <- function(inp) NGramTokenizer(inp, Weka_control(min = n, max = n))
  dtm <- DocumentTermMatrix(corp, control = list(tokenize = tok))
  # The corpus contains a single document, so the term frequencies can be read off directly
  df <- data.frame(word = dtm$dimnames$Terms, freq = dtm$v)
  df <- df[order(-df$freq), ]
  df
}
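To illustrate what the tokenizer inside getNgrams produces, consider the bigrams of a short made-up phrase (not taken from the corpus):
NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))
# expected bigrams: "thanks for", "for the", "the follow"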
library(ggplot2)
The following function plots the 15 most frequent n-grams.
plotNgrams <- function(df, n = 1, filename = "data", color = "#DDAAAA") {
  ggplot(head(df, 15), aes(x = reorder(word, freq), y = freq)) +
    geom_bar(stat = "identity", fill = color) + coord_flip() +
    ggtitle(paste0("Top 15 ", n, "-grams \nin ", filename)) +
    xlab("") + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0)) +
    scale_y_continuous(expand = c(0, 0)) +
    geom_text(aes(label = freq), size = 3, hjust = 1, vjust = 0, angle = 0)
}
library(gridExtra)
Next, the frequencies of the n-grams with n = 1 to 4 (unigrams, bigrams, trigrams and quadgrams) are calculated for each file, and the 15 most frequent ones are plotted.
en_us.blogs.unigrams <- getNgrams(en_us.blogs.corp,1)
en_us.news.unigrams <- getNgrams(en_us.news.corp,1)
en_us.twitter.unigrams <- getNgrams(en_us.twitter.corp,1)
grid.arrange(
  plotNgrams(en_us.blogs.unigrams, 1, "en_us.blogs", "#DDAAAA"),
  plotNgrams(en_us.news.unigrams, 1, "en_us.news", "#AADDAA"),
  plotNgrams(en_us.twitter.unigrams, 1, "en_us.twitter", "#AAAADD"),
  ncol = 3)
As the code is the same as before (only the 1s have to be replaced by 2s, 3s or 4s), the code blocks are hidden in the following sections; a sketch of the bigram block is shown below for reference.
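The (hidden) bigram block would look roughly as follows; the variable names are assumed here, and the trigram and quadgram blocks differ only in the value of n:
en_us.blogs.bigrams <- getNgrams(en_us.blogs.corp, 2)
en_us.news.bigrams <- getNgrams(en_us.news.corp, 2)
en_us.twitter.bigrams <- getNgrams(en_us.twitter.corp, 2)
grid.arrange(
  plotNgrams(en_us.blogs.bigrams, 2, "en_us.blogs", "#DDAAAA"),
  plotNgrams(en_us.news.bigrams, 2, "en_us.news", "#AADDAA"),
  plotNgrams(en_us.twitter.bigrams, 2, "en_us.twitter", "#AAAADD"),
  ncol = 3)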