The capstone project for the Coursera Data Science Specialization deals with text mining: guessing, or suggesting, the next word that will follow a given sequence of words. This milestone report covers the exploratory data analysis.
The files used to create the dictionary and build the word prediction come from HC Corpora (www.corpora.heliohost.org) and were downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
The zip file contains files in several languages; for this project I chose the English version. For each language there are three files with data from different sources, styles, and formats.
The first step is to load the files in R and compute some basic statistics about them and their content. To automate this summary I wrote a small piece of R code that loads all files from the data directory, removes non-printable characters, and then calculates and displays the following information per file: file name, file size, number of lines, length of the longest line, and number of words.
library(knitr);
library(NLP);
library(tm);
library(tau);
library(slam);
library(RColorBrewer);
library(wordcloud);
wc_l <- function(x) {length(unlist(strsplit(x," ")))}; # count words in a line by splitting on spaces
files_dir <- c("./final/");
files_list <- list.files(files_dir);
files_info <- file.info(paste(files_dir,files_list,sep=""));
files_stats <- data.frame(FileName=character(),FileSize_bytes=numeric(),
NoOfLines=numeric(), MaxLineSize=numeric(), TotalWords=numeric(),
stringsAsFactors=FALSE);
for (file_name in files_list) {
  file_con <- file(paste(files_dir,file_name,sep=""), "rb");
  f_lines <- readLines(file_con, encoding="UTF-8", skipNul=TRUE);
  f_lines <- gsub("[^[:print:]]","",f_lines); # drop non-printable characters
  lines <- length(f_lines);
  l_size <- sapply(f_lines,nchar);
  max_linesize <- max(l_size);
  wc_lines <- sapply(f_lines,wc_l);
  total_words <- sum(wc_lines);
  files_stats <- rbind(files_stats,
    cbind(file_name, files_info[paste(files_dir,file_name,sep=""),1],
          lines, max_linesize, total_words));
  close(file_con);
}
names(files_stats) <- c("FileName","FileSize_bytes","NoOfLines","MaxLineSize","TotalWords");
files_stats;
## FileName FileSize_bytes NoOfLines MaxLineSize TotalWords
## 1 en_US.blogs.txt 210160014 899288 40832 37332758
## 2 en_US.news.txt 205811889 1010242 11384 34372090
## 3 en_US.twitter.txt 167105338 2360148 140 30353488
Because the data is large, building the term-document matrix on the full corpus would take considerable memory and time, so the data is sampled before the corpus is created. The sample size used is 5%.
SampleSize <- 0.05;
f_corpus <- character();
for (file_name in files_list) {
  file_con <- file(paste(files_dir,file_name,sep=""), "rb");
  f_lines <- readLines(file_con, encoding="UTF-8", skipNul=TRUE);
  f_sample_lines <- sample(f_lines, round(length(f_lines)*SampleSize));
  # accumulate the cleaned sample from each file so the corpus covers all three sources
  f_corpus <- c(f_corpus, gsub("[^[:print:]]","",f_sample_lines));
  close(file_con);
}
doc_vs <- VectorSource(f_corpus);
docs <- Corpus(doc_vs);
rm(f_lines, f_sample_lines, f_corpus);
docs <- tm_map(docs, content_transformer(tolower)); # lower-case everything
docs <- tm_map(docs, removeNumbers);                # drop digits
docs <- tm_map(docs, removePunctuation);            # drop punctuation (also strips apostrophes)
docs <- tm_map(docs, stripWhitespace);              # collapse repeated whitespace
docs <- tm_map(docs, removeWords, stopwords("english")); # drop common English stop words
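Note that removePunctuation also strips apostrophes, so contractions such as "don't" and "can't" appear as "dont" and "cant" in the frequency tables below. To check that the transformations behaved as intended, it helps to peek at a couple of cleaned documents; a quick sanity check (my addition, not part of the processing pipeline):
inspect(docs[1:2]); # print the first two cleaned documents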
In document analysis, statistics on the frequencies of words and word combinations (n-grams) are computed for the documents provided. These statistics make it possible to categorize future documents, guess the next word, or implement word correction.
tdm_1gram <- TermDocumentMatrix(docs);
tdm_1gram_r <- rollup(tdm_1gram, 2, na.rm=TRUE, FUN = sum);
tdm_1gram_Freq <- sort((as.matrix(tdm_1gram_r)[,1]),decreasing=TRUE);
tdm_1gram_df <- data.frame(word=names(tdm_1gram_Freq), freq=tdm_1gram_Freq, stringsAsFactors=FALSE);
grams2 <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE);
tdm_2grams <- TermDocumentMatrix(docs, control = list(tokenize = grams2, bounds = list(global = c(1,Inf))));
tdm_2gram_r <- rollup(tdm_2grams, 2, na.rm=TRUE, FUN = sum);
tdm_2gram_Freq <- sort((as.matrix(tdm_2gram_r)[,1]),decreasing=TRUE);
tdm_2gram_df <- data.frame(word=names(tdm_2gram_Freq), freq=tdm_2gram_Freq, stringsAsFactors = FALSE);
grams3 <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE);
tdm_3grams <- TermDocumentMatrix(docs, control = list(tokenize = grams3, bounds = list(global = c(1,Inf))));
tdm_3gram_r <- rollup(tdm_3grams, 2, na.rm=TRUE, FUN = sum);
tdm_3gram_Freq <- sort((as.matrix(tdm_3gram_r)[,1]),decreasing=TRUE);
tdm_3gram_df <- data.frame(word=names(tdm_3gram_Freq), freq=tdm_3gram_Freq, stringsAsFactors = FALSE);
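The three n-gram blocks above repeat the same build/rollup/sort pattern. As a possible refactoring (a sketch only, reusing exactly the tm/slam calls from above), the n-gram order could be passed as a parameter:
# Sketch: frequency table for n-grams of order n (n >= 2), mirroring the blocks above
ngram_freq_df <- function(docs, n) {
  tok <- function(x) unlist(lapply(ngrams(words(x), n), paste, collapse=" "), use.names=FALSE);
  tdm <- TermDocumentMatrix(docs, control=list(tokenize=tok, bounds=list(global=c(1,Inf))));
  freq <- sort(as.matrix(rollup(tdm, 2, na.rm=TRUE, FUN=sum))[,1], decreasing=TRUE);
  data.frame(word=names(freq), freq=freq, stringsAsFactors=FALSE);
}
# e.g. tdm_2gram_df could equivalently be built as ngram_freq_df(docs, 2)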
print("Top 10 words");
## [1] "Top 10 words"
tdm_1gram_df[1:10,c("freq"),drop=FALSE];
## freq
## just 7368
## like 6042
## get 5493
## love 5326
## good 5008
## will 4749
## day 4519
## can 4515
## thanks 4416
## dont 4364
print("Top 10 ngrams-2");
## [1] "Top 10 ngrams-2"
tdm_2gram_df[1:10,c("freq"),drop=FALSE];
## freq
## cant wait 854
## right now 829
## last night 578
## looking forward 462
## happy birthday 418
## dont know 389
## im going 381
## good morning 365
## feel like 349
## looks like 344
print("Top 10 ngrams-3");
## [1] "Top 10 ngrams-3"
tdm_3gram_df[1:10,c("freq"),drop=FALSE];
## freq
## happy mothers day 179
## cant wait see 172
## let us know 118
## happy new year 89
## cinco de mayo 54
## im pretty sure 49
## dont even know 47
## looking forward seeing 47
## cant wait get 42
## cant wait till 39
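Beyond the top-10 lists, the unigram table also answers a coverage question relevant to the final dictionary size: how many distinct words cover a given share of all word instances in the sample? A quick sketch (my addition, computed from the tdm_1gram_df built above):
# cumulative share of word instances covered by the most frequent words
coverage <- cumsum(tdm_1gram_df$freq)/sum(tdm_1gram_df$freq);
c(words_50pct=which(coverage >= 0.5)[1], words_90pct=which(coverage >= 0.9)[1]);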
barplot(tdm_1gram_df$freq[1:20], cex.names=0.5, names.arg=tdm_1gram_df$word[1:20], col="green", horiz=TRUE, main="Word frequency", las=2);
barplot(tdm_2gram_df$freq[1:20], cex.names=0.5, names.arg=tdm_2gram_df$word[1:20], col="blue", horiz=TRUE, main="2-gram frequency", las=2);
barplot(tdm_3gram_df$freq[1:20], cex.names=0.5, names.arg=tdm_3gram_df$word[1:20], col="red", horiz=TRUE, main="3-gram frequency", las=2);
wc_col <- brewer.pal(8,"Dark2");
wordcloud(tdm_1gram_df$word[1:50], tdm_1gram_df$freq[1:50], random.order=F, ordered.colors=F, colors=wc_col);
text(x=0.5, y=0, "Word cloud for 1-grams");
wordcloud(tdm_2gram_df$word[1:50], tdm_2gram_df$freq[1:50], scale=c(3,.5), random.order=F, ordered.colors=F, colors=wc_col);
text(x=0.5, y=0, "Word cloud for 2-grams");
wordcloud(tdm_3gram_df$word[1:50], tdm_3gram_df$freq[1:50], scale=c(3,.5), random.order=F, ordered.colors=F, colors=wc_col);
text(x=0.5, y=0, "Word cloud for 3-grams");
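As a preview of how these frequency tables could drive prediction, here is a naive lookup (an illustration only, not the final model) against the 3-gram table built above: given a two-word prefix, it returns the most frequent continuations.
# tdm_3gram_df is already sorted by frequency, so the first matches
# are the most frequent continuations of the given prefix
predict_next <- function(prefix, top=3) {
  hits <- tdm_3gram_df$word[grepl(paste0("^", prefix, " "), tdm_3gram_df$word)];
  head(sub(paste0("^", prefix, " "), "", hits), top);
}
predict_next("cant wait"); # per the table above: "see", "get", "till"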
Based on this exploratory analysis, the following next steps for the word-prediction project will be implemented, some of which will require further analysis: