suppressMessages(library(stringi))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(tm))
suppressMessages(library(SnowballC))
suppressMessages(library(RWeka))
This is the milestone report for the Data Science Specialization Capstone project. The goal of this report is to investigate the provided data, perform some exploratory analysis, and clean and prepare the data for model building.
At the end of the project a predictive model will be built and a small app will also be presented.
In this report only the English text files will be used. The data was gathered from Twitter, blogs and news sites.
The data was provided by the Data Science Specialization and their partners from SwiftKey, and was downloaded from the Coursera webpage.
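The download and extraction step itself is not shown in the report's chunks; a minimal sketch is given below. The data_url value is a placeholder for the link provided on the Coursera course page, and the zip file name is simply a chosen example.
# 'data_url' is a placeholder for the download link given on the Coursera page
data_url <- "<link from the Coursera course page>"
if (!dir.exists("./Data")) dir.create("./Data")
download.file(data_url, destfile = "./Data/swiftkey_data.zip")
unzip("./Data/swiftkey_data.zip", exdir = "./Data")  # should produce the ./Data/final tree shown below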
After downloading, saving and unzipping, the data have the following structure.
cat(system(command = "tree ./Data/final", intern = TRUE), sep = "\n")
## ./Data/final
## ├── de_DE
## │  ├── de_DE.blogs.txt
## │  ├── de_DE.news.txt
## │  └── de_DE.twitter.txt
## ├── en_US
## │  ├── en_US.blogs.txt
## │  ├── en_US.news.txt
## │  └── en_US.twitter.txt
## ├── fi_FI
## │  ├── fi_FI.blogs.txt
## │  ├── fi_FI.news.txt
## │  └── fi_FI.twitter.txt
## └── ru_RU
##     ├── ru_RU.blogs.txt
##     ├── ru_RU.news.txt
##     └── ru_RU.twitter.txt
##
## 4 directories, 12 files
As we can see from the data tree, there are three English data files: en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.
The main statistics about the English text data are shown below.
The first column shows the number of lines, the second the number of words, and the third the number of bytes.
fl_sz <- system(command = "wc ./Data/final/en_US/*", intern = TRUE)
cat(fl_sz, sep = "\n")
## 899288 37334114 210160014 ./Data/final/en_US/en_US.blogs.txt
## 1010242 34365936 205811889 ./Data/final/en_US/en_US.news.txt
## 2360148 30359804 167105338 ./Data/final/en_US/en_US.twitter.txt
## 4269678 102059854 583077241 total
As we can see from the table above, the en_US.blogs.txt file is 200.4 Mb and has 899288 lines, the en_US.news.txt file is 196.3 Mb and has 1010242 lines, and the en_US.twitter.txt file is 159.4 Mb and has 2360148 lines.
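The megabyte figures quoted above are not produced by the chunk before; they can be reproduced in R, for example as follows (assuming the same directory layout):
# file sizes in Mb (1 Mb = 2^20 bytes)
round(file.size(list.files("./Data/final/en_US", full.names = TRUE)) / 2^20, 1)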
All three data sets were read in order to perform some exploratory analysis.
# read the raw files and drop characters that cannot be converted to UTF-8
blog <- readLines("./Data/final/en_US/en_US.blogs.txt", warn = FALSE)
blog <- iconv(blog, to = "utf-8", sub = "")
news <- readLines("./Data/final/en_US/en_US.news.txt", warn = FALSE)
news <- iconv(news, to = "utf-8", sub = "")
twitter <- readLines("./Data/final/en_US/en_US.twitter.txt", warn = FALSE)
twitter <- iconv(twitter, to = "utf-8", sub = "")
To begin the exploratory analysis I decided to use the functionality of the stringi library.
st <- data.frame(matrix(nrow = 3, ncol = 4),
                 row.names = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))
colnames(st) <- c("Lines", "LinesNEmpty", "Chars", "CharsNWhite")
# general statistics
st[1, ] <- stri_stats_general(blog)
st[2, ] <- stri_stats_general(news)
st[3, ] <- stri_stats_general(twitter)
# words per line
blog_words <- stri_count_words(blog)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
st
## Lines LinesNEmpty Chars CharsNWhite
## en_US.blogs.txt 899288 899288 206824382 170389539
## en_US.news.txt 1010242 1010242 203223154 169860866
## en_US.twitter.txt 2360148 2360148 162096031 134082634
As we can see from the table above, the twitter data set has the most lines, but the news and blog data sets have the most characters.
Another important feature of the data is that some lines contain only one word or none at all. The blogs data set has 3 lines with 0 words and 10636 lines with 1 word, the news data set has 3377 lines with 1 word, and the twitter data set has 463 lines with 1 word.
For further analysis I have removed all these lines.
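These counts come from the per-line word counts computed above and can be reproduced, for example, with:
# number of empty and one-word lines per data set
sum(blog_words == 0); sum(blog_words == 1)
sum(news_words == 1)
sum(twitter_words == 1)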
blog <- blog[! (blog_words == 0 | blog_words == 1)]
news <- news[!news_words == 1]
twitter <- twitter[!twitter_words == 1]
Now we can look at the main statistics of all data sets in terms of word count.
blog_words <- stri_count_words(blog)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
psych::describe(blog_words)
## vars n mean sd median trimmed mad min max range skew
## 1 1 888649 42.24 46.65 29 34.49 32.62 2 6726 6724 9.34
## kurtosis se
## 1 887.82 0.05
psych::describe(news_words)
## vars n mean sd median trimmed mad min max range skew
## 1 1 1006865 34.52 22.78 32 32.38 19.27 2 1796 1794 2.65
## kurtosis se
## 1 67.6 0.02
psych::describe(twitter_words)
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2359685 12.75 6.91 12 12.43 8.9 2 47 45 0.34 -0.9
## se
## 1 0
From these statistics we can see that, as expected, the twitter data set has the shortest entries (on average 12.75 words per entry), while the blog data set has the longest (on average 42.24 words per entry). The blog and news entries also have the largest standard deviation and skewness.
For a better understanding of how skewed the data are, please look at figure 1.
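The multiplot() function used in the plotting chunks is not part of ggplot2 or grid; it is the small helper commonly taken from the R Graphics Cookbook. A minimal sketch of it, assumed here and built on the already loaded grid package, is:
# arrange several ggplot objects on one page using a grid layout
multiplot <- function(..., cols = 1) {
    plots <- list(...)
    n <- length(plots)
    layout <- matrix(seq_len(cols * ceiling(n / cols)),
                     nrow = ceiling(n / cols), ncol = cols)
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(n)) {
        idx <- which(layout == i, arr.ind = TRUE)
        print(plots[[i]], vp = viewport(layout.pos.row = idx[1, "row"],
                                        layout.pos.col = idx[1, "col"]))
    }
}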
f1_blog <- qplot(x = factor(1),
                 y = blog_words,
                 geom = "boxplot") +
    xlab("No. of words in Blog") + ylab("")
f1_news <- qplot(x = factor(1),
                 y = news_words,
                 geom = "boxplot") +
    xlab("No. of words in News") + ylab("")
f1_twitter <- qplot(x = factor(1),
                    y = twitter_words,
                    geom = "boxplot") +
    xlab("No. of words in Twitter") + ylab("")
multiplot(f1_blog, f1_news, f1_twitter, cols = 3)
For a better understanding of the data we will ignore observations that are more than 4 standard deviations above the mean. Please look at figure 2.
f1_blog <- qplot(x = factor(1),
                 y = subset(x = blog_words,
                            subset = blog_words < (mean(blog_words) + 4 * sd(blog_words))),
                 geom = "boxplot") +
    xlab("No. of words in Blog") + ylab("")
f1_news <- qplot(x = factor(1),
                 y = subset(x = news_words,
                            subset = news_words < (mean(news_words) + 4 * sd(news_words))),
                 geom = "boxplot") +
    xlab("No. of words in News") + ylab("")
f1_twitter <- qplot(x = factor(1),
                    y = subset(x = twitter_words,
                               subset = twitter_words < (mean(twitter_words) + 4 * sd(twitter_words))),
                    geom = "boxplot") +
    xlab("No. of words in Twitter") + ylab("")
multiplot(f1_blog, f1_news, f1_twitter, cols = 3)
Because of the size of the files I have decided to take only a 2% sample of each file.
# sampling
set.seed(1)
blog_small <- sample(blog, length(blog)*0.02)
news_small <- sample(news, length(news)*0.02)
twitter_small <- sample(twitter, length(twitter)*0.02)
# removing big files
rm(blog, news, twitter, blog_words, news_words, twitter_words); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 656917 35.1 4547696 242.9 4780581 255.4
## Vcells 6588199 50.3 136787419 1043.7 128306209 978.9
After creating the small data sets I used the functionality of the tm library and cleaned the data as follows:
- removed hashtags;
- removed decimals (separated by . and ,);
- removed numbers;
- removed punctuation;
- replaced extra white space with a single space;
- converted all text to lowercase.
# custom transformations: rm_dec splits numbers at . and , separators, rm_hash removes hashtags
rm_dec <- function(x) {gsub("([0-9]*)\\.|\\,([0-9]+)", "\\1 \\2", x)}
rm_hash <- function(x) {gsub("#[0-9a-zA-Z]+", " ", x)}
all_small <- c(blog_small, news_small, twitter_small)
all_corpus <- data.frame(all_small, stringsAsFactors = FALSE)
all_corpus <- DataframeSource(all_corpus)
all_corpus <- VCorpus(all_corpus)
all_corpus <- tm_map(all_corpus, content_transformer(rm_hash))
all_corpus <- tm_map(all_corpus, content_transformer(rm_dec))
all_corpus <- tm_map(all_corpus, stripWhitespace, lazy = TRUE)
all_corpus <- tm_map(all_corpus, content_transformer(tolower), lazy = TRUE)
all_corpus <- tm_map(all_corpus, removePunctuation, lazy = TRUE)
all_corpus <- tm_map(all_corpus, removeNumbers, lazy = TRUE)
all_corpus <- tm_map(all_corpus,
                     content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")),
                     mc.cores = 1)
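To illustrate what the two custom transformations do, here is a quick check on a made-up string (not taken from the corpus):
rm_hash("great talk #DataScience today")   # the hashtag is replaced by a space
rm_dec("version 3.14, about 1,5 percent")  # decimal separators become spaces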
To gain a better understanding of my data I created tokens. A token is a structure representing a lexeme that explicitly indicates its categorization for the purpose of parsing (Wikipedia - Lexical analysis). For my analysis I have created n-grams. An n-gram is a contiguous sequence of n items from a given sequence of text or speech (Wikipedia - n-gram).
I have decided to create:
- unigrams (n-grams of size 1), i.e. the frequencies of single words;
- bigrams (n-grams of size 2), i.e. the frequencies of pairs of consecutive words;
- trigrams (n-grams of size 3);
- quadrigrams (n-grams of size 4).
onegramToken <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
bigramToken <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigramToken <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
quadrigramToken <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
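# As a quick illustration (the sentence below is a made-up example, not taken
# from the corpus), the bigram tokenizer splits text into overlapping
# two-word sequences:
bigramToken("the quick brown fox")
# expected result: "the quick" "quick brown" "brown fox"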
# calculating the term-document matrices and n-grams
options(mc.cores = 1)
tdm_one <- TermDocumentMatrix(all_corpus,
                              control = list(removePunctuation = TRUE,
                                             removeNumbers = TRUE,
                                             stopwords = TRUE,
                                             removeSparseTerms = 0.8,
                                             tokenize = onegramToken))
tdm_bi <- TermDocumentMatrix(all_corpus,
                             control = list(removePunctuation = TRUE,
                                            removeNumbers = TRUE,
                                            stopwords = TRUE,
                                            removeSparseTerms = 0.8,
                                            tokenize = bigramToken))
tdm_tri <- TermDocumentMatrix(all_corpus,
                              control = list(removePunctuation = TRUE,
                                             removeNumbers = TRUE,
                                             stopwords = TRUE,
                                             removeSparseTerms = 0.8,
                                             tokenize = trigramToken))
tdm_four <- TermDocumentMatrix(all_corpus,
                               control = list(removePunctuation = TRUE,
                                              removeNumbers = TRUE,
                                              stopwords = TRUE,
                                              removeSparseTerms = 0.8,
                                              tokenize = quadrigramToken))
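Before extracting frequent terms it can be useful to check how large the resulting term-document matrices are (the exact numbers depend on the 2% sample, so the output is not shown here):
# number of distinct terms and documents in each term-document matrix
sapply(list(unigram = tdm_one, bigram = tdm_bi,
            trigram = tdm_tri, quadrigram = tdm_four), dim)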
Next we can see the most popular n-grams.
# Unigrams, which occur at least 3000 times
findFreqTerms(x = tdm_one, lowfreq=3000)
## [1] "can" "day" "dont" "get" "good" "just" "know"
## [8] "like" "love" "new" "now" "one" "people" "said"
## [15] "time" "will"
# Bigrams, which occur at least 2000 times
findFreqTerms(x = tdm_bi, lowfreq=2000)
## [1] "and the" "at the" "for the" "in a" "in the" "is a"
## [7] "of the" "on the" "to be" "to the" "with the"
# Trigrams, which occur at least 250 times
findFreqTerms(x = tdm_tri, lowfreq=250)
## [1] "a lot of" "as well as" "be able to" "going to be"
## [5] "it was a" "i want to" "one of the" "out of the"
## [9] "part of the" "some of the" "thanks for the" "the end of"
## [13] "to be a"
# Quadrigrams, which occur at least 75 times
findFreqTerms(x = tdm_four, lowfreq=75)
## [1] "at the end of" "at the same time"
## [3] "for the first time" "is one of the"
## [5] "one of the most" "thanks for the follow"
## [7] "the end of the" "the rest of the"
## [9] "to be able to" "when it comes to"
Lastly, I have plotted the most common n-grams.
# function for collecting frequencies
freq_df <- function(name){
    switch(name,
           one   = {t_data <- tdm_one;  freq <- 3000; cn <- "Unigram"},
           two   = {t_data <- tdm_bi;   freq <- 2000; cn <- "Bigram"},
           three = {t_data <- tdm_tri;  freq <- 250;  cn <- "Trigram"},
           four  = {t_data <- tdm_four; freq <- 75;   cn <- "Quadrigram"},
           {print("something wrong"); stop()})
    temp_matrix <- as.matrix(t_data[findFreqTerms(x = t_data, lowfreq = freq), ])
    temp_freq_sum <- rowSums(temp_matrix)
    temp_freq_results <- data.frame(findFreqTerms(x = t_data, lowfreq = freq),
                                    temp_freq_sum,
                                    stringsAsFactors = FALSE)
    colnames(temp_freq_results) <- c(cn, "Frequency")
    rownames(temp_freq_results) <- NULL
    rm(t_data, temp_matrix, temp_freq_sum)
    return(temp_freq_results[order(-temp_freq_results[, 2]), ])
}
plot_func <- function(name){
    pl_data <- freq_df(name)
    X <- colnames(pl_data)[1]
    Y <- colnames(pl_data)[2]
    fig <- ggplot(data = pl_data, aes_string(x = X, y = Y)) +
        geom_bar(stat = "identity") +
        scale_x_discrete(limits = as.character(pl_data[, 1])) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
    return(fig)
}
plot_func("one")
plot_func("two")
plot_func("three")
plot_func("four")