This is the Week 2 Milestone report for the Coursera Data Science Capstone Project.
The goals of this report are to:
1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
The predictive model will be trained on a document corpus compiled from three sources of text data:
- Blogs
- Twitter
- News
The model will only focus on the English language.
The data was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
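If the archive is not already present locally, it can be fetched and unpacked with base R. A minimal sketch (the destination folder is an assumption chosen to match the `dir` path used below, and the archive is assumed to contain a `final/` directory):
# Download and unpack the SwiftKey corpus (only if not already present)
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!dir.exists("Coursera-SwiftKey/final/en_US")) {
  download.file(zip_url, zip_file, mode = "wb")
  unzip(zip_file, exdir = "Coursera-SwiftKey")
}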
dir <- "Coursera-SwiftKey/final/en_US"
filelist <- list.files(dir)
filelistFullPath <- file.path(dir, filelist)
CACHE_FILE <- "basic_report"
loadObjects(CACHE_FILE)
library(stringr)  # str_count() is used for word counting below
if (!exists(CACHE_FILE) || is.null(basic_report))
{
  # File sizes in MB
  basic_report <- lapply(filelistFullPath, function(x) round(file.info(x)$size / 1024 / 1024))
  basic_report <- as.data.frame(basic_report)
  # Initialize vectors for line and word counts
  line_counts <- c()
  word_counts <- c()
  # Read each file and count lines & words in a single pass
  for (file in filelistFullPath) {
    lines <- readLines(file, warn = FALSE)
    line_counts <- c(line_counts, length(lines))                  # Count lines
    word_counts <- c(word_counts, sum(str_count(lines, "\\S+")))  # Count words
  }
  names(basic_report) <- gsub("\\.txt$", "", filelist)
  basic_report <- rbind(basic_report, line_counts, word_counts)
  basic_report <- as.data.frame(lapply(basic_report, format_number))
  rownames(basic_report) <- c("size(MB)", "Lines", "Words")
  save_objects(c("basic_report"), CACHE_FILE)
}
|  | en_US.blogs | en_US.news | en_US.twitter |
|---|---|---|---|
| size(MB) | 200 | 196 | 159 |
| Lines | 899,288 | 77,259 | 2,360,148 |
| Words | 37,334,131 | 2,643,969 | 30,373,543 |
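The code chunks in this report rely on small helper functions (`loadObjects()`, `save_objects()`, `format_number()`, `show_table()`) defined elsewhere in the project. A minimal sketch of what the caching and formatting helpers might look like, purely for illustration (these are assumptions, not the actual implementations):
# Hypothetical caching helpers -- illustrative only; the real versions live elsewhere in the project
save_objects <- function(object_names, cache_file) {
  # Save the named objects from the caller's environment to an .RData cache
  save(list = object_names, file = paste0(cache_file, ".RData"), envir = parent.frame())
}
loadObjects <- function(cache_file) {
  # Restore cached objects into the caller's environment if the cache exists
  path <- paste0(cache_file, ".RData")
  if (file.exists(path)) load(path, envir = parent.frame())
}
format_number <- function(x) format(x, big.mark = ",", scientific = FALSE)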
Some words are more frequent than others: what are the distributions of word frequencies? To keep the analysis tractable, a random sample of 10,000 lines is read from each of the three files.
set.seed(12345)
CACHE_FILE <- "text_data"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(text_data))
{
  sample_size <- 10000
  # Define function to read a random sample of lines from one file
  read_random_sample <- function(file_path, sample_size = 10000) {
    # Look up the total line count from `basic_report` instead of re-counting
    total_lines <- basic_report["Lines", gsub("\\.txt$", "", basename(file_path))]
    # Convert from character to numeric (`basic_report` stores formatted numbers)
    total_lines <- as.numeric(gsub(",", "", total_lines))
    # Select random line numbers
    sampled_lines <- sort(sample(1:total_lines, min(sample_size, total_lines)))
    # Read the file and keep only the sampled lines
    text_data <- readLines(file_path, warn = FALSE)[sampled_lines]
    return(text_data)
  }
  # Read random lines from all files
  text_data <- unlist(lapply(filelistFullPath, read_random_sample))
  save_objects(c("text_data"), CACHE_FILE)
}
Analyze word frequencies
library(tm)  # Text Mining Package
CACHE_FILE <- "corpus"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(corpus))
{
  # Create a Corpus
  corpus <- Corpus(VectorSource(text_data))
  # Clean Text Data
  corpus <- tm_map(corpus, content_transformer(tolower))  # Convert to lowercase
  corpus <- tm_map(corpus, removePunctuation)             # Remove punctuation
  corpus <- tm_map(corpus, removeNumbers)                 # Remove numbers
  corpus <- tm_map(corpus, removeWords, stopwords("en"))  # Remove stopwords
  corpus <- tm_map(corpus, stripWhitespace)               # Remove extra spaces
  # Create Term-Document Matrix and word-frequency table
  tdm <- TermDocumentMatrix(corpus)
  word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)
  save_objects(c("corpus","tdm","word_freq","word_freq_df"), CACHE_FILE)
}
Word Cloud of Most Frequent Words
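The rendered report shows the figure here; a minimal sketch of how such a cloud can be drawn from `word_freq_df` (the plotting parameters are assumptions, mirroring the n-gram clouds later in the report):
library(wordcloud)
library(RColorBrewer)
# Word cloud of the most frequent unigrams
wordcloud(words = word_freq_df$word, freq = word_freq_df$freq, min.freq = 10,
          max.words = 100, colors = brewer.pal(8, "Dark2"), random.order = FALSE)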
Bar Plot of Top 20 Words
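Likewise, a sketch of the bar plot of the 20 most frequent words (assumed to follow the same pattern as the `drawBars()` helper used for the n-grams below):
library(ggplot2)
# Bar plot of the 20 most frequent words
ggplot(head(word_freq_df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = "Top 20 Most Frequent Words", x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))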
What are the frequencies of 2-grams and 3-grams in the dataset?
# Convert Corpus to Plain Text and build n-gram frequency tables
CACHE_FILE <- "ngrams"
loadObjects(CACHE_FILE)
if (!exists("bigram_df") || is.null(bigram_df))
{
  # Collapse the cleaned corpus into a single character string
  text_clean <- sapply(corpus, as.character)
  text_clean <- paste(text_clean, collapse = " ")
  nGram <- function(wordsLength) {
    # Tokenizer that builds n-grams of the requested length (NLP::ngrams, attached via tm)
    tokenizer <- function(x) {
      unlist(lapply(ngrams(words(x), wordsLength), paste, collapse = " "))
    }
    # Create Term-Document Matrix with the n-gram tokenizer
    tdm_ngram <- TermDocumentMatrix(VCorpus(VectorSource(text_clean)),
                                    control = list(tokenize = tokenizer))
    # Convert matrix to a frequency-sorted data frame and return it
    ngram_freq <- sort(rowSums(as.matrix(tdm_ngram)), decreasing = TRUE)
    data.frame(ngram = names(ngram_freq), freq = ngram_freq)
  }
  bigram_df <- nGram(2)
  trigram_df <- nGram(3)
  save_objects(c("bigram_df","trigram_df"), CACHE_FILE)
}
library(ggplot2)
drawBars <- function(ds, title) {
  ggplot(head(ds, 20), aes(x = reorder(ngram, -freq), y = freq)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    theme_minimal() +
    labs(title = title, x = "", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
drawBars(bigram_df,"Top 20 Most Frequent Bigrams")
drawBars(trigram_df,"Top 20 Most Frequent Trigrams")
library(wordcloud)
library(RColorBrewer)
drawCloud <- function(ds) {
  wordcloud(words = ds$ngram, freq = ds$freq, min.freq = 10,
            max.words = 100, colors = brewer.pal(8, "Dark2"), random.order = FALSE)
}
drawCloud(bigram_df)
drawCloud(trigram_df)
How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?
To determine how many unique words are needed to cover 50% and 90% of all word instances in the sample, we sort the words by frequency, compute the cumulative sum of their counts, and find where the cumulative coverage crosses each threshold:
library(dplyr)
# Compute cumulative sum and coverage percentages
total_words <- sum(word_freq_df$freq)
# Sort words by frequency and compute cumulative coverage
word_freq_df <- word_freq_df %>%
  arrange(desc(freq)) %>%  # Ensure sorting before cumulative sum
  mutate(cumulative_freq = cumsum(freq),
         coverage = cumulative_freq / total_words)
# Find word count needed to cover 50% & 90% of instances
words_50 <- nrow(word_freq_df[word_freq_df$coverage <= 0.50, ])
words_90 <- nrow(word_freq_df[word_freq_df$coverage <= 0.90, ])
To cover 50% of all word instances in the sample we need 1,062 words; to cover 90% we need 15,900 words.
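A cumulative-coverage curve makes these thresholds easy to verify visually; a minimal ggplot2 sketch (the styling is an assumption):
library(ggplot2)
# Plot cumulative coverage against the number of most-frequent words used
ggplot(word_freq_df, aes(x = seq_along(coverage), y = coverage)) +
  geom_line(color = "steelblue") +
  geom_hline(yintercept = c(0.50, 0.90), linetype = "dashed") +
  theme_minimal() +
  labs(x = "Number of unique words (frequency-sorted)", y = "Coverage of word instances")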
How do you evaluate how many of the words come from foreign languages?
CACHE_FILE <- "detected_languages"
loadObjects(CACHE_FILE)
if (!exists(CACHE_FILE) || is.null(detected_languages))
{
  library(textcat)
  # Detect language of each sampled line
  detected_languages <- textcat(text_data)
  # Count occurrences of each detected language
  detected_languages <- as.data.frame(table(detected_languages))
  save_objects(c("detected_languages"), CACHE_FILE)
}
colnames(detected_languages) <- c("Language", "Count")
rownames(detected_languages) <- NULL
detected_languages <- detected_languages[order(-detected_languages$Count), ]
detected_languages <- head(detected_languages,10)
show_table(detected_languages,FALSE)
| Language | Count |
|---|---|
| english | 22215 |
| scots | 3764 |
| middle_frisian | 800 |
| german | 338 |
| catalan | 263 |
| danish | 247 |
| frisian | 243 |
| afrikaans | 183 |
| manx | 172 |
| rumantsch | 111 |
Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?
- Lemmatization: instead of treating “running”, “ran”, and “runs” as separate words, convert them to “run” to reduce dictionary size while keeping meaning (`textstem::lemmatize_words()`); see the sketch after this list.
- Collocation detection: some important phrases (“New York”, “data science”) should not be split into separate words. Detecting collocations ensures that frequently co-occurring words are treated as single units (`quanteda::tokens()`, `textstat_collocations()`).
- Word embeddings: if a word does not appear in the dictionary, embeddings can find a similar known word (`word2vec::read.wordvectors()`, `nearest_neighbors()`).
- Spell correction: typos and misspellings reduce coverage. Use fuzzy matching to map misspelled words to known words (`hunspell::hunspell_suggest()`).
- Subword tokenization: instead of storing full words, break words into common subword parts (`tokenizers::tokenize_words()`).
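As an illustration of the first point above, a minimal sketch of how lemmatization with `textstem` could shrink the dictionary built earlier (an assumption for illustration, not part of the original analysis):
library(textstem)
library(dplyr)
# Collapse inflected forms onto their lemma before computing coverage
lemma_freq_df <- word_freq_df %>%
  mutate(lemma = lemmatize_words(as.character(word))) %>%
  group_by(lemma) %>%
  summarise(freq = sum(freq)) %>%
  arrange(desc(freq))
# The lemmatized dictionary needs fewer unique entries for the same coverage
nrow(word_freq_df)   # unique surface forms
nrow(lemma_freq_df)  # unique lemmas (smaller)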