Introduction

This is the Week 2 Milestone report for the Coursera Data Science Capstone Project.

The goals of this report are to:
1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on the plans for creating a prediction algorithm and Shiny app.

The predictive model will be trained on a document corpus compiled from three sources of text data:
- Blogs
- Twitter
- News

The model will only focus on the English language.

A basic report of summary statistics about the data sets

dir <- c("Coursera-SwiftKey/final/en_US")
filelist <- list.files(dir)
filelistFullPath <- file.path(dir, filelist)
CACHE_FILE <- "basic_report"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(basic_report)) 
{
  basic_report <- lapply(filelistFullPath, function(x) round(file.info(x)$size / 1024 / 1024))  # File sizes in MB
  basic_report <- as.data.frame(basic_report)
  # Initialize lists for line and word counts
  line_counts <- c()
  word_counts <- c()
  
  # Read each file once and count lines & words in a single pass
  library(stringr)  # str_count() is used for word counting
  for (file in filelistFullPath) {
    lines <- readLines(file, warn = FALSE)
    line_counts <- c(line_counts, length(lines))  # Count lines
    word_counts <- c(word_counts, sum(str_count(lines, "\\S+")))  # Count non-whitespace tokens as words
  }  

  names(basic_report) <- gsub("\\.txt$", "", filelist)
  basic_report <- rbind(basic_report, line_counts, word_counts)
  basic_report <- as.data.frame(lapply(basic_report, format_number))
  rownames(basic_report) <- c("size(MB)", "Lines", "Words")
  save_objects(c("basic_report"), CACHE_FILE)
}
           en_US.blogs   en_US.news   en_US.twitter
size(MB)           200          196             159
Lines          899,288       77,259       2,360,148
Words       37,334,131    2,643,969      30,373,543
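The code above (and the later blocks) relies on three small project helpers that are not shown in this report: loadObjects(), save_objects(), and format_number(). A minimal sketch of how they might be implemented, assuming one .RData cache file per object group (hypothetical implementations, not the project's actual code):

# Hypothetical versions of the helpers used throughout this report
save_objects <- function(object_names, cache_file) {
  # Save the named objects from the caller's environment into an .RData cache file
  save(list = object_names, file = paste0(cache_file, ".RData"), envir = parent.frame())
}

loadObjects <- function(cache_file) {
  # Restore cached objects into the global environment if the cache file exists
  path <- paste0(cache_file, ".RData")
  if (file.exists(path)) load(path, envir = .GlobalEnv)
}

format_number <- function(x) format(x, big.mark = ",", scientific = FALSE)

Caching the computed objects this way avoids re-reading the large text files every time the report is knitted.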

Word frequencies

Some words are more frequent than others, so what are the distributions of word frequencies? To explore this, we read a random sample of 10,000 lines from each file.

set.seed(12345)
CACHE_FILE <- "text_data"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(text_data)) 
{
  sample_size <- 10000
  # Define function to read a random sample of lines
  read_random_sample <- function(file_path, sample_size = 10000) {
    # Count total lines in the file
    total_lines <- basic_report["Lines", gsub("\\.txt$", "", basename(file_path))] # Get total lines from `basic_report`
    
    # Convert from character to numeric (since `basic_report` stores formatted numbers)
    total_lines <- as.numeric(gsub(",", "", total_lines))
    
    # Select random line numbers
    sampled_lines <- sort(sample(1:total_lines, min(sample_size, total_lines)))
    
    # Read the whole file and keep only the sampled lines
    text_data <- readLines(file_path, warn = FALSE)[sampled_lines]
    
    return(text_data)
  }
  
  # Read random lines from all files
  text_data <- unlist(lapply(filelistFullPath, read_random_sample, sample_size = sample_size))
  save_objects(c("text_data"), CACHE_FILE)
}

Analyze word frequencies

library(tm) #Text Mining Package
CACHE_FILE <- "corpus"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(corpus)) 
{
  # Create a Corpus
  corpus <- Corpus(VectorSource(text_data))
  
  # Clean Text Data
  corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lowercase
  corpus <- tm_map(corpus, removePunctuation)            # Remove punctuation
  corpus <- tm_map(corpus, removeNumbers)               # Remove numbers
  corpus <- tm_map(corpus, removeWords, stopwords("en")) # Remove stopwords
  corpus <- tm_map(corpus, stripWhitespace)             # Remove extra spaces
  
  # Create Term-Document Matrix
  tdm <- TermDocumentMatrix(corpus)
  word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)
  
  save_objects(c("corpus","tdm","word_freq","word_freq_df"), CACHE_FILE)
}

Word Cloud of Most Frequent Words
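The plotting code for the unigram cloud is not included above; a minimal sketch using the wordcloud and RColorBrewer packages (the same packages used for the n-gram clouds later in this report) would be:

library(wordcloud)
library(RColorBrewer)

# Word cloud of the most frequent unigrams (minimum frequency 10, at most 100 words)
wordcloud(words = word_freq_df$word, freq = word_freq_df$freq, min.freq = 10,
          max.words = 100, colors = brewer.pal(8, "Dark2"), random.order = FALSE)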

Bar Plot of Top 20 Words
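Likewise, the bar plot can be drawn directly from word_freq_df; a minimal ggplot2 sketch, mirroring the drawBars() helper defined further down:

library(ggplot2)

# Top 20 unigrams by frequency
ggplot(head(word_freq_df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = "Top 20 Most Frequent Words", x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))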

What are the frequencies of 2-grams and 3-grams in the dataset?

# Convert Corpus to Plain Text
CACHE_FILE <- "ngrams"
loadObjects(CACHE_FILE)
if (!exists("bigram_df") || is.null(bigram_df)) 
{
  text_clean <- sapply(corpus, as.character)
  text_clean <- paste(text_clean, collapse = " ")
  
  nGram <- function (wordsLength) {
    tokenizer <- function(x) {
      unlist(lapply(ngrams(words(x), wordsLength), paste, collapse = " "))
    }
    # Create Term-Document Matrices
    tdm_ngram <- TermDocumentMatrix(VCorpus(VectorSource(text_clean)), control = list(tokenize = tokenizer))
    # Convert Matrices to Data Frames
    ngram_freq <- sort(rowSums(as.matrix(tdm_ngram)), decreasing = TRUE)
    ngram_df <- data.frame(ngram = names(ngram_freq), freq = ngram_freq)
    ngram_df  # Return the n-gram frequency table
  }
  bigram_df <<- nGram(2)
  trigram_df <<- nGram(3)
  save_objects(c("bigram_df","trigram_df"), CACHE_FILE)
}
library(ggplot2)  # ggplot() is used for the bar plots

drawBars <- function(ds, title) {
  ggplot(head(ds, 20), aes(x = reorder(ngram, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = title, x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

drawBars(bigram_df,"Top 20 Most Frequent Bigrams")

drawBars(trigram_df,"Top 20 Most Frequent Trigrams")

library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal()

drawCloud <- function(ds) {
  wordcloud(words = ds$ngram, freq = ds$freq, min.freq = 10,
            max.words = 100, colors = brewer.pal(8, "Dark2"), random.order = FALSE)
}

drawCloud(bigram_df)

drawCloud(trigram_df)

How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?

To determine how many unique words are needed to cover 50% and 90% of all word instances in the dataset, we:

- Use the word frequency table built from the corpus.
- Sort words by frequency in descending order.
- Compute the cumulative coverage to find the smallest set of words that accounts for 50% and 90% of all word occurrences.
library(dplyr)

# Compute cumulative sum and coverage percentages
total_words <- sum(word_freq_df$freq)

# Sort words by frequency and compute cumulative coverage
word_freq_df <- word_freq_df %>%
  arrange(desc(freq)) %>%  # Ensure sorting before cumulative sum
  mutate(cumulative_freq = cumsum(freq),
         coverage = cumulative_freq / total_words)

# Find word count needed to cover 50% & 90% of instances
words_50 <- nrow(word_freq_df[word_freq_df$coverage <= 0.50, ])
words_90 <- nrow(word_freq_df[word_freq_df$coverage <= 0.90, ])

To cover 50% of all word instances we need 1,062 unique words; to cover 90% we need 15,900.

How do you evaluate how many of the words come from foreign languages?

CACHE_FILE <- "detected_languages"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(detected_languages)) 
{
  library(textcat)
  
  # Detect language of each text sample
  detected_languages <- textcat(text_data)
  
  # Count foreign language occurrences
  detected_languages <- as.data.frame(table(detected_languages))
  save_objects(c("detected_languages"), CACHE_FILE)
}
colnames(detected_languages) <- c("Language", "Count")
rownames(detected_languages) <- NULL
detected_languages <- detected_languages[order(-detected_languages$Count), ]
detected_languages <- head(detected_languages,10)
show_table(detected_languages,FALSE)
Language          Count
english           22215
scots              3764
middle_frisian      800
german              338
catalan             263
danish              247
frisian             243
afrikaans           183
manx                172
rumantsch           111

Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?

1. Lemmatization: Reduce Word Variants

Instead of treating “running”, “ran”, and “runs” as separate words, convert them to “run” to reduce dictionary size while keeping meaning.
textstem::lemmatize_words()
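A minimal sketch, assuming the textstem package is installed:

library(textstem)

# Collapse inflected forms to a single lemma, e.g. "running", "ran", "runs" -> "run"
lemmatize_words(c("running", "ran", "runs"))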

2. Phrase Detection (Collocations)

Some important phrases (“New York”, “data science”) should not be split into separate words. Detecting collocations ensures that frequently co-occurring words are treated as single units.
quanteda::tokens(), textstat_collocations()
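A minimal sketch, assuming the quanteda package is installed (in recent quanteda releases textstat_collocations() lives in the companion package quanteda.textstats); the example sentences are illustrative:

library(quanteda)
library(quanteda.textstats)

# Score frequently co-occurring word pairs so they can be kept as single units
toks <- tokens(c("I visited New York last week",
                 "New York is a data science hub",
                 "data science keeps growing"),
               remove_punct = TRUE)
textstat_collocations(toks, size = 2, min_count = 2)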

3. Word Embeddings (Finding Similar Words)

If a word does not appear in the dictionary, embeddings can find a similar known word. word2vec::read.wordvectors(), predict(type = "nearest")
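A hedged sketch, assuming the word2vec package: a small model trained on the sampled text (the dim/iter values and the query word are illustrative), with predict(type = "nearest") used to look up the closest known words; read.wordvectors() could equally load pre-trained vectors.

library(word2vec)

# Train a small embedding model on the sampled text (illustrative parameters)
model <- word2vec(x = tolower(text_data), dim = 50, iter = 5)

# For a word, list the closest known words in the vocabulary
predict(model, newdata = "president", type = "nearest", top_n = 5)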

4. Spelling Normalization (Fuzzy Matching)

Typos and misspellings reduce coverage. Use fuzzy matching to map misspelled words to known words. hunspell::hunspell_suggest()
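A minimal sketch, assuming the hunspell package is installed; the misspelled words are illustrative:

library(hunspell)

# Flag words that are not in the default English dictionary and get suggested corrections
words <- c("speling", "recieve", "langauge")
bad   <- !hunspell_check(words)
hunspell_suggest(words[bad])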

5. Subword Tokenization (Byte Pair Encoding - BPE)

Instead of storing full words, break words into common subword parts.
tokenizers.bpe::bpe(), bpe_encode()
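A hedged sketch, assuming the tokenizers.bpe package (the vocab_size and the example word are illustrative):

library(tokenizers.bpe)

# Train a byte pair encoding model on the sampled text, then split a word into subword units
tmp <- tempfile(fileext = ".txt")
writeLines(text_data, tmp)
bpe_model <- bpe(tmp, vocab_size = 5000)
bpe_encode(bpe_model, x = "unbelievable", type = "subwords")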