Introduction

This Milestone Report documents the initial steps of the Data Science Capstone project, using the English-language SwiftKey corpora (blogs, news, and Twitter).
The goals of this report are to:

  • load and sample the three datasets,
  • clean the text and build a corpus,
  • tokenize the text into unigrams, bigrams, and trigrams,
  • explore word and n-gram frequencies, and
  • summarize findings relevant to building a predictive text model.

Loading the Data

# Set seed for reproducibility
set.seed(123)

# Sample 10,000 lines from each dataset to keep processing time manageable
blogs_sample <- sample(readLines("SwiftKey/final/en_US/en_US.blogs.txt", warn = FALSE), 10000)
news_sample <- sample(readLines("SwiftKey/final/en_US/en_US.news.txt", warn = FALSE), 10000)
twitter_sample <- sample(readLines("SwiftKey/final/en_US/en_US.twitter.txt", warn = FALSE), 10000)

# Check that the samples were loaded
cat("Number of lines in each sampled dataset:\n")
## Number of lines in each sampled dataset:
cat("Blogs:", length(blogs_sample), "\n")
## Blogs: 10000
cat("News:", length(news_sample), "\n")
## News: 10000
cat("Twitter:", length(twitter_sample), "\n")
## Twitter: 10000
# Inspect first few lines
cat("\nFirst 3 lines of Blogs:\n")
## 
## First 3 lines of Blogs:
head(blogs_sample, 3)
## [1] "The bruschetta however, missed the mark. Instead of manageable two-bite crostini, these were huge slices of grilled bread and heaped with toppings of tomato, cannellini beans and roasted peppers with goat cheese."                                                                                                                                                                                                              
## [2] "Walden Pond, Mt. Rainier, Big Sur, Everglades and so forth;"                                                                                                                                                                                                                                                                                                                                                                       
## [3] "Despite laws banning cell phones while driving and increased awareness of the dangers of doing so, it’s a common fact that cell phone use while driving is still a widespread occurrence. Perhaps most discouraging to the issue is that much of this distracted driving occurs amongst young drivers, which is not only a safety concern, but also might indicate that the problem could be deeply rooted for future generations."
cat("\nFirst 3 lines of News:\n")
## 
## First 3 lines of News:
head(news_sample, 3)
## [1] "In general, however, the defense faces an extremely high hurdle in getting a new trial because of a juror's actions or public statements after the jury has announced its verdict in a criminal case." 
## [2] "Tuominen attributes the school's success to strong academic expectations and solid citizenship. Students are asked to follow an \"HONOR\" code – to be Honest, On time, Noble, On task and Respectful."
## [3] "The indictment alleges that Giudice used his brother's information to obtain a driver's license. His license had been suspended after a DUI."
cat("\nFirst 3 lines of Twitter:\n")
## 
## First 3 lines of Twitter:
head(twitter_sample, 3)
## [1] "You can't beat 11 hours of power. Somebody get me a 5 hour energy drink"                                  
## [2] "Add me, love me and love my stuff. Ha ha ha"                                                              
## [3] "Not on site so can't verify if is in the house - but thought that was a good guess! Hope you get tickets!"

Cleaning the Data

# Combine sampled datasets into one corpus
library(tm)
## Loading required package: NLP
library(stringi)

corpus <- Corpus(VectorSource(c(blogs_sample, news_sample, twitter_sample)))

# Clean the text
corpus <- tm_map(corpus, content_transformer(tolower))       
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)                  
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)                      
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("en"))       
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("en")):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)                    
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Inspect cleaned text
inspect(corpus[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1]  bruschetta however missed mark instead manageable twobite crostini huge slices grilled bread heaped toppings tomato cannellini beans roasted peppers goat cheese                                                                                                                                 
## [2] walden pond mt rainier big sur everglades forth                                                                                                                                                                                                                                                   
## [3] despite laws banning cell phones driving increased awareness dangers ’s common fact cell phone use driving still widespread occurrence perhaps discouraging issue much distracted driving occurs amongst young drivers safety concern also might indicate problem deeply rooted future generations
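
Note that stringi is loaded above but not used in the tm pipeline, and some non-ASCII characters survive the cleaning: the curly apostrophe in document [3] above, for example, is left behind. An optional extra step, sketched below with an illustrative helper name, would strip non-ASCII characters from the samples before the corpus is built.

# Optional sketch: strip non-ASCII characters (curly quotes, symbols) with stringi
# before building the corpus; clean_sample is an illustrative helper name
clean_sample <- function(x) {
  stringi::stri_replace_all_regex(x, "[^\\x20-\\x7E]", " ")
}
blogs_clean   <- clean_sample(blogs_sample)
news_clean    <- clean_sample(news_sample)
twitter_clean <- clean_sample(twitter_sample)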

Tokenization

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)

# Combine sampled datasets into a data frame
text_df <- data.frame(text = c(blogs_sample, news_sample, twitter_sample))

# Unigrams (single words)
unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Inspect top 10 unigrams
head(unigrams, 10)
##    word     n
## 1   the 44248
## 2    to 24263
## 3   and 23036
## 4     a 21150
## 5    of 18917
## 6    in 15041
## 7     i 13296
## 8  that  9632
## 9   for  9195
## 10   is  9024
# Bigrams (2-word sequences)
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)

# Inspect top 10 bigrams
head(bigrams, 10)
##      bigram    n
## 1    of the 4129
## 2    in the 3910
## 3    to the 1964
## 4    on the 1741
## 5   for the 1704
## 6     to be 1440
## 7   and the 1262
## 8    at the 1171
## 9      in a 1059
## 10 with the 1013
# Trigrams (3-word sequences)
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  count(trigram, sort = TRUE)

# Inspect top 10 trigrams
head(trigrams, 10)
##        trigram   n
## 1         <NA> 881
## 2   one of the 335
## 3     a lot of 264
## 4   the end of 169
## 5      to be a 151
## 6   out of the 138
## 7  some of the 138
## 8   as well as 137
## 9  going to be 137
## 10    it was a 131
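
The most frequent "trigram" above is <NA>: unnest_tokens() returns NA for documents that contain fewer tokens than the n-gram size (common for very short tweets), so these rows should be dropped before the tables are used. A minimal fix:

# Drop NA rows produced by documents shorter than the n-gram size
bigrams  <- bigrams  %>% filter(!is.na(bigram))
trigrams <- trigrams %>% filter(!is.na(trigram))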

Word Frequency Analysis

library(dplyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(tidytext)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Combine sampled datasets into a data frame
text_df <- data.frame(text = c(blogs_sample, news_sample, twitter_sample))

# Unigrams (single words) frequency table
unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Plot top 10 unigrams
head(unigrams, 10) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Unigrams", x = "Word", y = "Frequency")

# Keep only top 100 words for the word cloud
top_unigrams <- head(unigrams, 100)

# Generate a word cloud (top 100 words)
wordcloud(words = top_unigrams$word,
          freq = top_unigrams$n,
          min.freq = 10,
          scale = c(3, 0.5),
          colors = brewer.pal(8, "Dark2"))
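
The same bar-chart approach can be applied to the bigram and trigram tables built in the Tokenization section. A brief sketch for bigrams, mirroring the unigram chart above:

# Plot top 10 bigrams, mirroring the unigram chart
head(bigrams, 10) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_bar(stat = "identity", fill = "darkorange") +
  coord_flip() +
  labs(title = "Top 10 Bigrams", x = "Bigram", y = "Frequency")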

Exploratory Analysis of N-Grams

library(dplyr)
library(tidytext)

# Combine the sampled datasets into a data frame (as in the Tokenization section)
text_df <- data.frame(text = c(blogs_sample, news_sample, twitter_sample))

# Unigrams
unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Bigrams
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)

# Trigrams
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  count(trigram, sort = TRUE)
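
A useful follow-up question for the prediction model is how many unique words are needed to cover most of the text. The sketch below computes the cumulative share of all word instances accounted for by the most frequent unigrams (the table is already sorted by frequency):

# How many unique words cover 50% and 90% of all word instances?
coverage <- unigrams %>%
  mutate(cum_share = cumsum(n) / sum(n))

cat("Words needed for 50% coverage:", sum(coverage$cum_share < 0.5) + 1, "\n")
cat("Words needed for 90% coverage:", sum(coverage$cum_share < 0.9) + 1, "\n")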

Summary of Findings

  • Most common unigrams: “the”, “to”, “and”, “a”, “of”
  • Most common bigrams: “of the”, “in the”, “to the”
  • Most common trigrams: “one of the”, “a lot of”, “the end of”
  • These top words and n-grams reflect common English usage across blogs, news, and Twitter. Because the n-gram counts were computed from the raw samples rather than the stopword-free corpus, stopwords dominate the unigram and bigram lists.

These results help identify patterns for building a predictive text model.
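
As a preview of how these tables could feed the prediction model, the sketch below implements a simple next-word lookup with backoff: given the last two words typed (assumed lowercase, matching the lowercased n-gram tables), it returns the most frequent trigram completion, falls back to the bigram table if none is found, and otherwise returns the most frequent unigram. The function name and structure are illustrative assumptions, not the final model.

library(dplyr)
library(stringr)

# Illustrative sketch: next-word lookup with simple backoff
predict_next_word <- function(w1, w2) {
  # Most frequent trigram starting with "w1 w2 "
  hit <- trigrams %>%
    filter(str_starts(trigram, fixed(paste(w1, w2, "")))) %>%
    slice_head(n = 1)
  if (nrow(hit) > 0) return(word(hit$trigram, 3))

  # Back off to the most frequent bigram starting with "w2 "
  hit <- bigrams %>%
    filter(str_starts(bigram, fixed(paste0(w2, " ")))) %>%
    slice_head(n = 1)
  if (nrow(hit) > 0) return(word(hit$bigram, 2))

  # Final fallback: the single most frequent word
  unigrams$word[1]
}

predict_next_word("one", "of")   # expected to return "the" given the counts above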