This Milestone Report is prepared for the Data Science Capstone offered by Johns Hopkins University (Coursera).
Large databases comprising text in a target language are commonly used when building language models for various purposes. The motivation for this report is to:
For the purpose of this project, we will explore the English (en_US) files from the SwiftKey training dataset linked here.
# load library
library(dplyr)
library(ggplot2)
# load data
# Read a text file line by line, marking the strings as UTF-8.
#   path : path of the file to read
# Returns a character vector with one element per line.
# The connection is opened in binary mode ("rb"): a text-mode read on Windows
# stops at the first embedded SUB (Ctrl-Z, 0x1A) character and silently returns
# a truncated file, whereas a binary-mode read continues to the real end.
# NOTE(review): the 77,259 news lines reported in the summary table look like
# such a truncation -- confirm the counts after re-running.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
}

data_blog <- read_corpus("./data/en_US.blogs.txt")
data_news <- read_corpus("./data/en_US.news.txt")
data_twit <- read_corpus("./data/en_US.twitter.txt")
Create a function tidy_data() to remove special characters, digits, and multiple white spaces from the loaded datasets.
# Clean a character vector of text lines.
#   dataset : character vector, one element per line of text
# Returns a character vector of the same length in which punctuation and
# digits are removed, runs of spaces are collapsed to a single space, and
# leading/trailing spaces are dropped.
tidy_data <- function(dataset){
  dataset <- gsub("[[:punct:]]", "", dataset)
  dataset <- gsub("[[:digit:]]", "", dataset)
  dataset <- gsub(" +", " ", dataset)
  # Removing a leading or trailing number/symbol leaves a blank at the edge of
  # the line (e.g. "2012 was great" -> " was great"). Splitting such a line on
  # " " yields an empty first token, which would be counted as a word and show
  # up inside n-grams, so strip the edges as well.
  dataset <- gsub("^ +| +$", "", dataset)
  dataset
}
Clean the datasets using the function tidy_data() created.
# Overwrite each raw corpus with its cleaned version.
data_blog <- data_blog %>% tidy_data()
data_news <- data_news %>% tidy_data()
data_twit <- data_twit %>% tidy_data()
Basic summary of the datasets.
# number of lines
lines_blog <- length(data_blog)
lines_news <- length(data_news)
lines_twit <- length(data_twit)
# number of words: space-separated tokens per line, summed over all lines.
# lengths() returns the token count of every list element in one vectorised
# call, and always yields an integer vector (also for an empty dataset).
count_blog <- sum(lengths(strsplit(data_blog, " ")))
count_news <- sum(lengths(strsplit(data_news, " ")))
count_twit <- sum(lengths(strsplit(data_twit, " ")))
# maximum length: number of characters in the longest line.
# nchar() is already vectorised; wrapping it in sapply() only adds a slow
# element-by-element loop and copies every line into the names of the result.
maxlength_blog <- max(nchar(data_blog))
maxlength_news <- max(nchar(data_news))
maxlength_twit <- max(nchar(data_twit))
Create summary table.
# summary table
# Build the table column by column so the statistics keep their numeric type.
# Combining a label and numbers in one c() call coerces every value to
# character, which leaves a table that cannot be sorted or summarised
# numerically.
summary_data <- data.frame(
  source = c("Blogs", "News", "Twitter"),
  num_lines = c(lines_blog, lines_news, lines_twit),
  num_words = c(count_blog, count_news, count_twit),
  max_length = c(maxlength_blog, maxlength_news, maxlength_twit),
  stringsAsFactors = FALSE
)
summary_data
## source num_lines num_words max_length
## 1 Blogs 899288 36933280 38943
## 2 News 77259 2581232 3278
## 3 Twitter 2360148 29454833 253
Create a function n_gram() that returns the n-grams (sequences of n consecutive words) found in a dataset together with their counts. Because the datasets are large, the n-grams are computed on a random sample of lines; the default sample size is 500.
# Count the n-grams found in a random sample of sentences.
#   dataset     : character vector of sentences (words separated by " ")
#   n           : size of the n-gram (number of consecutive words), >= 1
#   sample_size : maximum number of sentences to sample from `dataset`
# Returns an ungrouped data frame with one row per distinct n-gram and the
# columns `n_gram` (the phrase), `n_size` (the value of `n`) and `counts`
# (number of occurrences in the sample), sorted by decreasing `counts`.
# The result has zero rows when no sampled sentence contains `n` words.
n_gram <- function(dataset, n = 1, sample_size = 500){
  stopifnot(is.numeric(n), length(n) == 1, n >= 1)
  gram_size <- n

  # Fixed seed so that every call samples the same sentences from a dataset.
  # Note that this resets the global random number generator state.
  set.seed(0)
  sub_dataset <- sample(dataset, min(sample_size, length(dataset)))

  # Build the n-grams of each sentence separately and flatten once at the
  # end, instead of growing a single vector inside nested loops.
  grams_per_sentence <- lapply(strsplit(sub_dataset, " "), function(word_list) {
    num_grams <- length(word_list) - gram_size + 1
    if (num_grams < 1) {
      # sentence is shorter than the requested n-gram size
      return(character(0))
    }
    vapply(seq_len(num_grams), function(word_idx) {
      paste(word_list[word_idx:(word_idx + gram_size - 1)], collapse = " ")
    }, character(1))
  })
  # as.character() turns the NULL produced by an empty list into character(0)
  n_gram_list <- as.character(unlist(grams_per_sentence))

  data.frame(n_gram = n_gram_list, stringsAsFactors = FALSE) %>%
    mutate(n_size = gram_size) %>%
    group_by(n_gram, n_size) %>%
    summarise(counts = n()) %>%
    # drop the grouping left over by summarise() so callers get a plain table
    ungroup() %>%
    arrange(desc(counts))
}
Create n-grams of size 1 to 4 using the function n_gram() and keep the top num_record entries for each size.
# Number of most frequent n-grams to keep for each n-gram size.
num_record <- 10
# n-gram sizes to extract: single words up to four-word phrases.
ngram_sizes <- 1:4

# Return the `num_record` most frequent n-grams of `dataset` for every size in
# `ngram_sizes`, stacked into one table ordered by size. The per-size tables
# are collected in a list and bound once, rather than growing the result with
# rbind() inside a loop.
top_ngrams <- function(dataset) {
  bind_rows(lapply(ngram_sizes, function(size) {
    n_gram(dataset, size) %>% head(num_record)
  }))
}

ngram_blog <- top_ngrams(data_blog)
ngram_news <- top_ngrams(data_news)
ngram_twit <- top_ngrams(data_twit)
Combine n-grams into single dataframe.
# Tag every table with the corpus it came from.
ngram_blog <- mutate(ngram_blog, n_src = "blogs")
ngram_news <- mutate(ngram_news, n_src = "news")
ngram_twit <- mutate(ngram_twit, n_src = "twitter")
# Stack the three tables and clear any grouping carried over from them.
ngram_all <- ungroup(rbind(ngram_blog, ngram_news, ngram_twit))
Plot the distribution of n-grams. Observe the following:
# Draw one chart per n-gram size: horizontal bars of the n-gram counts, with a
# separate panel for each source.
for (size in 1:4) {
  print(
    ggplot(
      filter(ngram_all, n_size == size),
      aes(x = reorder(n_gram, counts), y = counts)
    ) +
      geom_bar(stat = "identity") +
      labs(
        title = "Distribution of n-grams across blogs, news, and twitter",
        subtitle = "Based on top 10 word or phrases for each n-gram",
        x = "",
        y = "counts"
      ) +
      theme(
        axis.title.y = element_text(angle = 0),
        axis.text.y = element_text(vjust = 0.5, angle = 0)
      ) +
      # flip so the n-gram labels are read horizontally along the side
      coord_flip() +
      facet_grid(n_src ~ n_size, scales = "free")
  )
}