Introduction

This is the Week 2 Milestone report for the Coursera Data Science Capstone Project.

The goals of this report are to:
1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on the plans for creating a prediction algorithm and Shiny app.

The predictive model will be trained on a document corpus compiled from three sources of text data:
- Blogs
- Twitter
- News

The model will only focus on the English language.

A basic report of summary statistics about the data sets

dir <- c("Coursera-SwiftKey/final/en_US")
filelist <- list.files(dir)
filelistFullPath <- file.path(dir, filelist)
CACHE_FILE <- "basic_report"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(basic_report)) 
{
  basic_report <- lapply(filelistFullPath, function(x) round(file.info(x)$size / 1024 / 1024))  # File sizes in MB
  basic_report <- as.data.frame(basic_report)
  # Initialize lists for line and word counts
  line_counts <- c()
  word_counts <- c()
  
  # Read each file once and count lines & words in a single pass
  library(stringr)  # str_count() is used for word counting
  for (file in filelistFullPath) {
    lines <- readLines(file, warn = FALSE)
    line_counts <- c(line_counts, length(lines))  # Count lines
    word_counts <- c(word_counts, sum(str_count(lines, "\\S+")))  # Count non-whitespace tokens as words
  }  

  names(basic_report) <- gsub("\\.txt$", "", filelist)
  basic_report <- rbind(basic_report, line_counts, word_counts)
  basic_report <- as.data.frame(lapply(basic_report, format_number))
  rownames(basic_report) <- c("size(MB)", "Lines", "Words")
  save_objects(c("basic_report"), CACHE_FILE)
}
           en_US.blogs   en_US.news   en_US.twitter
size(MB)           200          196             159
Lines          899,288       77,259       2,360,148
Words       37,334,131    2,643,969      30,373,543
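The code above (and the later blocks) relies on three small project helpers that are not shown in this report: loadObjects(), save_objects(), and format_number(). A minimal sketch of how they might be implemented, assuming one .RData cache file per object group (hypothetical implementations, not the project's actual code):

# Hypothetical versions of the helpers used throughout this report
save_objects <- function(object_names, cache_file) {
  # Save the named objects from the caller's environment into an .RData cache file
  save(list = object_names, file = paste0(cache_file, ".RData"), envir = parent.frame())
}

loadObjects <- function(cache_file) {
  # Restore cached objects into the global environment if the cache file exists
  path <- paste0(cache_file, ".RData")
  if (file.exists(path)) load(path, envir = .GlobalEnv)
}

format_number <- function(x) format(x, big.mark = ",", scientific = FALSE)

Caching the computed objects this way avoids re-reading the large text files every time the report is knitted.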

Word frequencies

Some words are more frequent than others, so what are the distributions of word frequencies? To explore this, we read a random sample of 10,000 lines from each file.

set.seed(12345)
CACHE_FILE <- "text_data"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(text_data)) 
{
  sample_size <- 10000
  # Define function to read a random sample of lines
  read_random_sample <- function(file_path, sample_size = 10000) {
    # Count total lines in the file
    total_lines <- basic_report["Lines", gsub("\\.txt$", "", basename(file_path))] # Get total lines from `basic_report`
    
    # Convert from character to numeric (since `basic_report` stores formatted numbers)
    total_lines <- as.numeric(gsub(",", "", total_lines))
    
    # Select random line numbers
    sampled_lines <- sort(sample(1:total_lines, min(sample_size, total_lines)))
    
    # Read the whole file and keep only the sampled lines
    text_data <- readLines(file_path, warn = FALSE)[sampled_lines]
    
    return(text_data)
  }
  
  # Read random lines from all files
  text_data <- unlist(lapply(filelistFullPath, read_random_sample, sample_size = sample_size))
  save_objects(c("text_data"), CACHE_FILE)
}

Analyze word frequencies

library(tm) #Text Mining Package
CACHE_FILE <- "corpus"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(corpus)) 
{
  # Create a Corpus
  corpus <- Corpus(VectorSource(text_data))
  
  # Clean Text Data
  corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lowercase
  corpus <- tm_map(corpus, removePunctuation)            # Remove punctuation
  corpus <- tm_map(corpus, removeNumbers)               # Remove numbers
  corpus <- tm_map(corpus, removeWords, stopwords("en")) # Remove stopwords
  corpus <- tm_map(corpus, stripWhitespace)             # Remove extra spaces
  
  # Create Term-Document Matrix
  tdm <- TermDocumentMatrix(corpus)
  word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)
  
  save_objects(c("corpus","tdm","word_freq","word_freq_df"), CACHE_FILE)
}

Word Cloud of Most Frequent Words
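The plotting code for the unigram cloud is not included above; a minimal sketch using the wordcloud and RColorBrewer packages (the same packages used for the n-gram clouds later in this report) would be:

library(wordcloud)
library(RColorBrewer)

# Word cloud of the most frequent unigrams (minimum frequency 10, at most 100 words)
wordcloud(words = word_freq_df$word, freq = word_freq_df$freq, min.freq = 10,
          max.words = 100, colors = brewer.pal(8, "Dark2"), random.order = FALSE)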

Bar Plot of Top 20 Words
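Likewise, the bar plot can be drawn directly from word_freq_df; a minimal ggplot2 sketch, mirroring the drawBars() helper defined further down:

library(ggplot2)

# Top 20 unigrams by frequency
ggplot(head(word_freq_df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = "Top 20 Most Frequent Words", x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))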

What are the frequencies of 2-grams and 3-grams in the dataset?

# Convert Corpus to Plain Text
CACHE_FILE <- "ngrams"
loadObjects(CACHE_FILE)
if (!exists("bigram_df") || is.null(bigram_df)) 
{
  text_clean <- sapply(corpus, as.character)
  text_clean <- paste(text_clean, collapse = " ")
  
  nGram <- function (wordsLength) {
    tokenizer <- function(x) {
      unlist(lapply(ngrams(words(x), wordsLength), paste, collapse = " "))
    }
    # Create Term-Document Matrices
    tdm_ngram <- TermDocumentMatrix(VCorpus(VectorSource(text_clean)), control = list(tokenize = tokenizer))
    # Convert Matrices to Data Frames
    ngram_freq <- sort(rowSums(as.matrix(tdm_ngram)), decreasing = TRUE)
    ngram_df <- data.frame(ngram = names(ngram_freq), freq = ngram_freq)
    ngram_df  # Return the n-gram frequency table
  }
  bigram_df <<- nGram(2)
  trigram_df <<- nGram(3)
  save_objects(c("bigram_df","trigram_df"), CACHE_FILE)
}
library(ggplot2)  # ggplot() is used for the bar plots

drawBars <- function(ds, title) {
  ggplot(head(ds, 20), aes(x = reorder(ngram, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = title, x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

drawBars(bigram_df,"Top 20 Most Frequent Bigrams")

drawBars(trigram_df,"Top 20 Most Frequent Trigrams")

library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal()

drawCloud <- function(ds) {
  wordcloud(words = ds$ngram, freq = ds$freq, min.freq = 10,
            max.words = 100, colors = brewer.pal(8, "Dark2"), random.order = FALSE)
}

drawCloud(bigram_df)

drawCloud(trigram_df)

How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?

To determine how many unique words are needed to cover 50% and 90% of all word instances in the dataset, we:

- Use the word frequency table built from the corpus.
- Sort words by frequency in descending order.
- Compute the cumulative coverage to find the smallest set of words that accounts for 50% and 90% of all word occurrences.
library(dplyr)

# Compute cumulative sum and coverage percentages
total_words <- sum(word_freq_df$freq)

# Sort words by frequency and compute cumulative coverage
word_freq_df <- word_freq_df %>%
  arrange(desc(freq)) %>%  # Ensure sorting before cumulative sum
  mutate(cumulative_freq = cumsum(freq),
         coverage = cumulative_freq / total_words)

# Find word count needed to cover 50% & 90% of instances
words_50 <- nrow(word_freq_df[word_freq_df$coverage <= 0.50, ])
words_90 <- nrow(word_freq_df[word_freq_df$coverage <= 0.90, ])

To cover 50% of all word instances we need 1,062 unique words; to cover 90% we need 15,900.

How do you evaluate how many of the words come from foreign languages?

CACHE_FILE <- "detected_languages"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(detected_languages)) 
{
  library(textcat)
  
  # Detect language of each text sample
  detected_languages <- textcat(text_data)
  
  # Count foreign language occurrences
  detected_languages <- as.data.frame(table(detected_languages))
  save_objects(c("detected_languages"), CACHE_FILE)
}
colnames(detected_languages) <- c("Language", "Count")
rownames(detected_languages) <- NULL
detected_languages <- detected_languages[order(-detected_languages$Count), ]
detected_languages <- head(detected_languages,10)
show_table(detected_languages,FALSE)
Language          Count
english           22215
scots              3764
middle_frisian      800
german              338
catalan             263
danish              247
frisian             243
afrikaans           183
manx                172
rumantsch           111

Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?

1. Lemmatization: Reduce Word Variants

Instead of treating “running”, “ran”, and “runs” as separate words, convert them to “run” to reduce dictionary size while keeping meaning.
textstem::lemmatize_words()
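A minimal sketch, assuming the textstem package is installed:

library(textstem)

# Collapse inflected forms to a single lemma, e.g. "running", "ran", "runs" -> "run"
lemmatize_words(c("running", "ran", "runs"))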

2. Phrase Detection (Collocations)

Some important phrases (“New York”, “data science”) should not be split into separate words. Detecting collocations ensures that frequently co-occurring words are treated as single units.
quanteda::tokens(), textstat_collocations()
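A minimal sketch, assuming the quanteda package is installed (in recent quanteda releases textstat_collocations() lives in the companion package quanteda.textstats); the example sentences are illustrative:

library(quanteda)
library(quanteda.textstats)

# Score frequently co-occurring word pairs so they can be kept as single units
toks <- tokens(c("I visited New York last week",
                 "New York is a data science hub",
                 "data science keeps growing"),
               remove_punct = TRUE)
textstat_collocations(toks, size = 2, min_count = 2)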

3. Word Embeddings (Finding Similar Words)

If a word does not appear in the dictionary, embeddings can find a similar known word. word2vec::read.wordvectors(), predict(type = "nearest")
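A hedged sketch, assuming the word2vec package: a small model trained on the sampled text (the dim/iter values and the query word are illustrative), with predict(type = "nearest") used to look up the closest known words; read.wordvectors() could equally load pre-trained vectors.

library(word2vec)

# Train a small embedding model on the sampled text (illustrative parameters)
model <- word2vec(x = tolower(text_data), dim = 50, iter = 5)

# For a word, list the closest known words in the vocabulary
predict(model, newdata = "president", type = "nearest", top_n = 5)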

4. Spelling Normalization (Fuzzy Matching)

Typos and misspellings reduce coverage. Use fuzzy matching to map misspelled words to known words. hunspell::hunspell_suggest()
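A minimal sketch, assuming the hunspell package is installed; the misspelled words are illustrative:

library(hunspell)

# Flag words that are not in the default English dictionary and get suggested corrections
words <- c("speling", "recieve", "langauge")
bad   <- !hunspell_check(words)
hunspell_suggest(words[bad])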

5. Subword Tokenization (Byte Pair Encoding - BPE)

Instead of storing full words, break words into common subword parts.
tokenizers.bpe::bpe(), bpe_encode()
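A hedged sketch, assuming the tokenizers.bpe package (the vocab_size and the example word are illustrative):

library(tokenizers.bpe)

# Train a byte pair encoding model on the sampled text, then split a word into subword units
tmp <- tempfile(fileext = ".txt")
writeLines(text_data, tmp)
bpe_model <- bpe(tmp, vocab_size = 5000)
bpe_encode(bpe_model, x = "unbelievable", type = "subwords")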