1. INTRODUCTION

This Milestone Report was prepared for the Data Science Capstone offered by Johns Hopkins University on Coursera.

Large databases comprising text in a target language are commonly used when generating language models for various purposes. The motivation for this report is to:

Demonstrate that the correct data has been successfully loaded.
Create a basic report of summary statistics about the datasets.
Report any interesting findings.
Get feedback on plans for creating a prediction algorithm and Shiny app.

2. THE DATA

For the purpose of this project, we will be exploring the English database from the Swiftkey training dataset linked here.

# load library
library(dplyr)
library(ggplot2)

# load data
# Read one corpus file into a character vector (one element per line).
#
# The file is read through a connection opened in binary mode ("rb"). In text
# mode on Windows, readLines() can stop early at an embedded Ctrl-Z (0x1A)
# byte and silently return only part of the file; this is the likely reason
# the news corpus shows far fewer lines than the other two in the summary
# table -- TODO confirm against the raw file's line count.
#
# path    : path to a UTF-8 encoded text file
# returns : character vector with one element per line of the file
read_corpus <- function(path){
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
}

data_blog <- read_corpus("./data/en_US.blogs.txt")
data_news <- read_corpus("./data/en_US.news.txt")
data_twit <- read_corpus("./data/en_US.twitter.txt")

Data Cleaning

Create a function tidy_data() to remove special characters, digits, and multiple white spaces from the loaded datasets.

# Clean a character vector of raw text lines.
#
# Removes punctuation and digits, collapses every run of whitespace (spaces,
# tabs, ...) into a single space, and strips the leading/trailing space so
# that a later strsplit(x, " ") does not produce empty "words".
#
# dataset : character vector, one element per line of text
# returns : character vector of the same length as `dataset`
tidy_data <- function(dataset){
  dataset <- gsub("[[:punct:]]", "", dataset)
  dataset <- gsub("[[:digit:]]", "", dataset)
  # Collapse any whitespace run, not only runs of the space character.
  dataset <- gsub("[[:space:]]+", " ", dataset)
  # Deleting punctuation/digits at the start or end of a line leaves a stray
  # space behind; after the collapse above it is at most one character wide.
  dataset <- gsub("^ | $", "", dataset)
  return(dataset)
}

Clean the datasets using the function tidy_data() created.

# Overwrite each raw corpus with its cleaned version.
data_twit <- tidy_data(dataset = data_twit)
data_news <- tidy_data(dataset = data_news)
data_blog <- tidy_data(dataset = data_blog)

Basic summary of the datasets.

# number of lines
lines_blog <- length(data_blog)
lines_news <- length(data_news)
lines_twit <- length(data_twit)

# number of words
# A "word" is a token produced by splitting a line on single spaces.
# lengths() returns the token count of every line in one vectorised call and
# yields 0 (instead of an error) for an empty corpus.
count_blog <- sum(lengths(strsplit(data_blog, " ")))
count_news <- sum(lengths(strsplit(data_news, " ")))
count_twit <- sum(lengths(strsplit(data_twit, " ")))

# maximum length
# Longest line in characters. nchar() is already vectorised; wrapping it in
# sapply() only adds a per-element call and names every result with the full
# text of its line.
maxlength_blog <- max(nchar(data_blog))
maxlength_news <- max(nchar(data_news))
maxlength_twit <- max(nchar(data_twit))

Create summary table.

# summary table
# Build the table column by column so the counts stay numeric. Binding rows
# of c("Blogs", <numbers>) coerces every value to character, which leaves the
# count columns as strings that cannot be sorted or summed.
summary_data <- data.frame(
  source = c("Blogs", "News", "Twitter"),
  num_lines = c(lines_blog, lines_news, lines_twit),
  num_words = c(count_blog, count_news, count_twit),
  max_length = c(maxlength_blog, maxlength_news, maxlength_twit),
  stringsAsFactors = FALSE
)
summary_data

##    source num_lines num_words max_length
## 1   Blogs    899288  36933280      38943
## 2    News     77259   2581232       3278
## 3 Twitter   2360148  29454833        253

3. EXPLORATORY DATA ANALYSIS

Create a function n_gram() that returns the n-grams (sequences of n consecutive words) found in a dataset, together with their counts. Because the datasets are large, only a random sample of lines is used; the default sample size is 500.

# Count the n-grams (runs of n consecutive words) in a random sample of lines.
#
# dataset     : character vector of sentences, words separated by single " "
# n           : size of n-gram (number of words per n-gram)
# sample_size : maximum number of sentences drawn from `dataset`
#
# Returns a tibble with columns n_gram (the phrase), n_size (= n) and counts,
# sorted by descending counts. dplyr's summarise() drops only the last
# grouping variable, so the result is still grouped by n_gram; callers that
# need a plain table should ungroup() it. A sample that contains no n-gram of
# the requested size gives a zero-row result.
n_gram <- function(dataset, n = 1, sample_size = 500){
  # Fixed seed so every call samples the same sentences from a given dataset.
  # Note that this resets the session-wide RNG state.
  set.seed(0)
  sub_dataset <- sample(dataset, min(sample_size, length(dataset)))

  # One character vector of n-grams per sampled sentence; sentences with
  # fewer than n words contribute nothing.
  grams_per_sentence <- lapply(strsplit(sub_dataset, " "), function(word_list){
    n_starts <- length(word_list) - n + 1
    if(n_starts < 1){
      return(character(0))
    }
    vapply(seq_len(n_starts),
           function(word_idx){
             paste(word_list[word_idx:(word_idx + n - 1)], collapse = " ")
           },
           character(1))
  })
  # as.character() turns the NULL from an all-empty list into character(0).
  gram_values <- as.character(unlist(grams_per_sentence, use.names = FALSE))

  # n_size is simply n. A sentence that yields an n-gram has at least n
  # tokens and therefore at least n characters, so capping n by the longest
  # line length (which needs a scan of the whole dataset) never changes it.
  n_gram_list <- data.frame(n_gram = gram_values,
                            n_size = rep(n, length(gram_values)),
                            stringsAsFactors = FALSE) %>%
    group_by(n_gram, n_size) %>%
    summarise(counts = n()) %>%
    arrange(desc(counts))

  return(n_gram_list)
}

Create the 1- to 4-grams for each source using the n_gram() function, keeping only the top num_record entries for each n.

num_record <- 10

# Stack the `num_record` most frequent n-grams of one corpus for n = 1, 2, 3
# and 4, in that order. The 1-gram call relies on n_gram()'s default n.
collect_top_ngrams <- function(corpus){
  stacked <- head(n_gram(corpus), num_record)
  for(n_words in 2:4){
    stacked <- rbind(stacked, head(n_gram(corpus, n_words), num_record))
  }
  stacked
}

ngram_blog <- collect_top_ngrams(data_blog)
ngram_news <- collect_top_ngrams(data_news)
ngram_twit <- collect_top_ngrams(data_twit)

Combine n-grams into single dataframe.

# Tag every row with the corpus it came from, then stack the three tables
# and drop the grouping left over from n_gram().
ngram_blog <- mutate(ngram_blog, n_src = "blogs")
ngram_news <- mutate(ngram_news, n_src = "news")
ngram_twit <- mutate(ngram_twit, n_src = "twitter")
ngram_all <- ungroup(rbind(ngram_blog, ngram_news, ngram_twit))

Plot the n-gram distributions. Observe the following:

1/2-gram: relatively consistent across all 3 sources and of low information value, since the top entries are dominated by articles, conjunctions, and connectors.
3/4-gram: blogs read as descriptive or story-like, news as informative and formal, and twitter as personal and very informal.

# One figure per n-gram size: horizontal bars of the top phrases, with one
# facet row per source.
for(size in 1:4){
  print(
    ggplot(data = filter(ngram_all, n_size == size),
           mapping = aes(x = reorder(n_gram, counts), y = counts)) +
      geom_bar(stat = "identity") +
      labs(title = "Distribution of n-grams across blogs, news, and twitter",
           subtitle = "Based on top 10 word or phrases for each n-gram",
           x = "", y = "counts") +
      theme(axis.title.y = element_text(angle = 0),
            axis.text.y = element_text(vjust = 0.5, angle = 0)) +
      # Flip so the phrases are read along the vertical axis.
      coord_flip() +
      facet_grid(n_src ~ n_size, scales = "free")
  )
}

Milestone Report

EugLee

18 August 2018

1. INTRODUCTION

2. THE DATA

Data Cleaning

3. EXPLORATORY DATA ANALYSIS