library(stopwords)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidytext)
library(ggplot2)
library(tidytext)
library(stringr)
library(textclean)
library(ngram)
library(tokenizers)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(sentimentr)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(kernlab)

## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(tm)

## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## 
## Attaching package: 'tm'
## 
## The following object is masked from 'package:stopwords':
## 
##     stopwords

library(SnowballC)

Expectations for EDA

I expect stopwords to be more popular than normal words.
I expect that news will be significantly different from blogs and Twitter, as it is more formal and less personal.
I expect at least 60-70% of unique words sorted to cover 50% of all word instances in the language.
I expect foreign words to appear; however, their influence should be limited and won’t damage the model, as many foreign words used regularly in English, such as ‘ballet’ or ‘deja vu’, express concepts and ideas.
To increase the coverage, replacing words that have similar meanings with the more common instance can potentially reduce the word count to cover the same number of phrases.

# Location of the Coursera/SwiftKey corpus archive (one ZIP, several locales).
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Temporary file that receives the downloaded ZIP archive.
temp_zip <- tempfile(fileext = ".zip")

# download.file() honours the `timeout` option, which defaults to 60 seconds;
# that is too short for a large archive like this one. Raise it for this
# download only and put the previous value back afterwards.
old_options <- options(timeout = max(3600, getOption("timeout")))
download.file(url, temp_zip, mode = "wb")  # mode = "wb" for binary files
options(old_options)

# List the contents of the ZIP file (nothing is extracted to disk here).
file_list <- unzip(temp_zip, list = TRUE)
print(file_list)

##                             Name    Length                Date
## 1                         final/         0 2014-07-22 10:10:00
## 2                   final/de_DE/         0 2014-07-22 10:10:00
## 3  final/de_DE/de_DE.twitter.txt  75578341 2014-07-22 10:11:00
## 4    final/de_DE/de_DE.blogs.txt  85459666 2014-07-22 10:11:00
## 5     final/de_DE/de_DE.news.txt  95591959 2014-07-22 10:11:00
## 6                   final/ru_RU/         0 2014-07-22 10:10:00
## 7    final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
## 8     final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
## 9  final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
## 10                  final/en_US/         0 2014-07-22 10:10:00
## 11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
## 12    final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
## 13   final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
## 14                  final/fi_FI/         0 2014-07-22 10:10:00
## 15    final/fi_FI/fi_FI.news.txt  94234350 2014-07-22 10:11:00
## 16   final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
## 17 final/fi_FI/fi_FI.twitter.txt  25331142 2014-07-22 10:10:00

# Find an archive member by the tail of its path instead of by its row number
# in the listing, so a reordered archive cannot silently select another
# language's file. Stops if the member is missing or ambiguous.
find_corpus_file <- function(member_names, path_suffix) {
  hit <- member_names[endsWith(member_names, path_suffix)]
  if (length(hit) != 1L) {
    stop("Expected exactly one archive member ending in '", path_suffix,
         "', found ", length(hit), call. = FALSE)
  }
  hit
}

# English (en_US) corpus files.
twitter_file <- find_corpus_file(file_list$Name, "en_US/en_US.twitter.txt")
news_file <- find_corpus_file(file_list$Name, "en_US/en_US.news.txt")
blog_file <- find_corpus_file(file_list$Name, "en_US/en_US.blogs.txt")

# Connections that read each member straight out of the ZIP archive.
twitter_con <- unz(temp_zip, twitter_file)

news_con <- unz(temp_zip, news_file)

blog_con <- unz(temp_zip, blog_file)

# Partitioning the data

# Fix the RNG state so every sample() call below is reproducible.
set.seed(54321)

# Read each corpus into memory, one element per line.
# skipNul = TRUE skips embedded NUL bytes rather than letting them cut a line
# short. TRUE is spelled out because `T` is an ordinary, reassignable variable.
# encoding = "UTF-8" marks the strings as UTF-8 so non-ASCII characters are
# interpreted correctly regardless of the session locale.
# NOTE(review): assumes the corpus files are UTF-8 encoded; confirm.
twitter_lines <- readLines(twitter_con, skipNul = TRUE, encoding = "UTF-8")
news_lines <- readLines(news_con, skipNul = TRUE, encoding = "UTF-8")
blog_lines <- readLines(blog_con, skipNul = TRUE, encoding = "UTF-8")

# Split each corpus 70% / 15% / 15% into training, validation and test lines.

# Draw the index sets for one corpus of `n_lines` lines.
# Makes exactly two sample() calls (training first, then validation), so the
# random-number stream is consumed in the same order as writing the steps out
# by hand for each corpus.
partition_indices <- function(n_lines) {
  every_index <- seq_len(n_lines)
  train <- sample(every_index, size = 0.7 * n_lines)
  held_out <- setdiff(every_index, train)
  valid <- sample(held_out, size = 0.5 * length(held_out))
  list(
    train = train,
    held_out = held_out,
    valid = valid,
    test = setdiff(held_out, valid)
  )
}

# Twitter
twitter_parts <- partition_indices(length(twitter_lines))
twitter_indices <- twitter_parts$train
twitter_remaining_indices <- twitter_parts$held_out
twitter_validation_indices <- twitter_parts$valid
twitter_test_indices <- twitter_parts$test

twitter_train <- twitter_lines[twitter_indices]
twitter_valid <- twitter_lines[twitter_validation_indices]
twitter_test <- twitter_lines[twitter_test_indices]

# News
news_parts <- partition_indices(length(news_lines))
news_indices <- news_parts$train
news_remaining_indices <- news_parts$held_out
news_validation_indices <- news_parts$valid
news_test_indices <- news_parts$test

news_train <- news_lines[news_indices]
news_valid <- news_lines[news_validation_indices]
news_test <- news_lines[news_test_indices]

# Blog
blog_parts <- partition_indices(length(blog_lines))
blog_indices <- blog_parts$train
blog_remaining_indices <- blog_parts$held_out
blog_validation_indices <- blog_parts$valid
blog_test_indices <- blog_parts$test

blog_train <- blog_lines[blog_indices]
blog_valid <- blog_lines[blog_validation_indices]
blog_test <- blog_lines[blog_test_indices]

# Work on a fixed-size random subset of each training split to keep the
# exploratory analysis fast. Lines are drawn without replacement.
sample_chunk <- 10000

twitterSamp <- sample(twitter_train, sample_chunk)
newsSamp <- sample(news_train, sample_chunk)
blogSamp <- sample(blog_train, sample_chunk)

# Everything needed is in memory now: close the three ZIP connections, then
# delete the downloaded archive.
invisible(lapply(list(blog_con, twitter_con, news_con), close))
unlink(temp_zip)

# One fill colour per corpus (hex RGB), used as the bar fill in the ggplot
# charts further down.
twitter_colour <- "#1DA1F2"
news_colour <- "#333333"
blog_colour <- "#21759B"

TOKENIZATION

Profanity filtering - removing profanity and other words you do not want to predict.

I expect twitter to be more laden with profanity than news or blogs, as this is a personal site.

# Combined, de-duplicated profanity vocabulary built from the word lists that
# ship with the `lexicon` package.
# `profanity_arr_bad` used to be listed twice; the repeat added nothing after
# unique(), so it is listed once here.
# NOTE(review): the repeated entry may have been meant to be a different
# lexicon (e.g. lexicon::profanity_racist); confirm before adding one, since
# that would change the counts reported below.
profanity_list <- unique(c(lexicon::profanity_alvarez,
                           lexicon::profanity_arr_bad,
                           lexicon::profanity_banned,
                           lexicon::profanity_zac_anger))

# Score each sample once and reuse the result for both summaries below,
# instead of re-running the profanity scan for every statistic.
twitter_profanity <- profanity(twitterSamp, profanity_list = profanity_list)
news_profanity <- profanity(newsSamp, profanity_list = profanity_list)
blog_profanity <- profanity(blogSamp, profanity_list = profanity_list)

# Total number of profane terms found in each sample.
sum(twitter_profanity$profanity_count)

## [1] 1439

sum(news_profanity$profanity_count)

## [1] 2182

sum(blog_profanity$profanity_count)

## [1] 2536

# Sum of the `profanity` column divided by the number of sampled lines.
# NOTE(review): `profanity` is presumably the per-row rate
# (profanity_count / word_count) reported by sentimentr; confirm.
sum(twitter_profanity$profanity) / length(twitterSamp)

## [1] 0.02335812

sum(news_profanity$profanity) / length(newsSamp)

## [1] 0.01204824

sum(blog_profanity$profanity) / length(blogSamp)

## [1] 0.01753527

While news and blog posts had a higher count of profanity, the total percentage is smaller when compared to twitter. This likely accounts for the respective lengths and sizes of posts for each group.

Now I will first tokenize and transform the case of these texts to lower before removing profanity.

# Tokenize each sampled line into a character vector of words.
# The result is a list with one element per line.
twitter_token <- tokenize_words(twitterSamp)
news_token <- tokenize_words(newsSamp)
blog_token <- tokenize_words(blogSamp)

# Vocabulary to strip, computed once rather than once per document.
# Lower-cased because the tokens are expected to be lower case.
banned_words <- unique(tolower(profanity_list))

# Drop profane tokens from one document while keeping every other token,
# including repeats, in its original order.
# setdiff() must not be used here: it returns a set, so it would also collapse
# repeated words within a document and distort every word count and frequency
# computed from the cleaned tokens.
remove_profanity <- function(words) {
  words[!words %in% banned_words]
}

clean_tweet <- lapply(twitter_token, remove_profanity)
clean_news <- lapply(news_token, remove_profanity)
clean_blog <- lapply(blog_token, remove_profanity)

Lists have been tokenized and profanity has been removed. Now, some preliminary analysis will be done on the distribution of words and word frequencies, while also carrying out lemmatization, stopword removal and the correction of misspelled words.

EDA

Count the average number of characters in each dataset

Expectations: 1. News articles and blog posts should have more characters than twitter posts due to a word limit per post on twitter’s site. 2. There may be more variation in the former types of text. However, I’m unsure how much variation is too much. I think this will involve a deeper dive into the words and characters themselves.

# Number of characters in each document, counted over its cleaned tokens
# (whitespace and anything the tokenizer dropped are not counted).
# nchar() cannot be applied to the token list directly: it coerces each list
# element with as.character(), so it would measure the deparsed vector text
# (e.g. `c("a", "b")`, quotes and commas included) instead of the words.
count_characters <- function(tokenized_list) {
  vapply(tokenized_list, function(words) sum(nchar(words)), numeric(1))
}

# Stack the three histograms vertically for easy comparison.
par(mfrow = c(3, 1))
hist(count_characters(clean_tweet), main="Character distribution for Twitter Posts", xlab="Number of Characters", col=twitter_colour)
hist(count_characters(clean_news), main="Character distribution for News Articles", xlab="Number of Characters", col=news_colour)
hist(count_characters(clean_blog), main="Character distribution for Blog Posts", xlab="Number of Characters", col=blog_colour)

Twitter has an average of 50 characters with most between 25-100, with a range from 0 to 220, while news has an average of around 250-300 characters, with a range from 0 to 1500 in extreme cases. Finally, blog has an average between 0-500, with a range from 0 to 3500 in extreme cases. In conclusion, it is correct that news and blog posts have a greater number of characters per post.

Count the number of words appearing in each text

Expectations 1. I expect that again blog posts and news articles will have more words per sentence than twitter posts. 2. I expect there to be 3x to 5x more words in a news article and blog than a twitter post.

# Tokens per document for each corpus, shown as three stacked histograms.
par(mfrow = c(3, 1))

tweet_word_counts <- lengths(clean_tweet)
news_word_counts <- lengths(clean_news)
blog_word_counts <- lengths(clean_blog)

hist(tweet_word_counts, main = "Word distribution for Twitter Posts",
     xlab = "Number of Words", col = "#1DA1F2")
hist(news_word_counts, main = "Word distribution for News Articles",
     xlab = "Number of Words", col = "#333333")
hist(blog_word_counts, main = "Word distribution for Blog posts",
     xlab = "Number of Words", col = "#21759B")

It is correct that, on average, news articles and blog posts have more words than Twitter posts. Interestingly, the distribution is similar to the respective distributions for the number of characters. This may indicate that while the news and blog posts are longer, the words they use are similar.

Average word length

Expectations 1. Average word length will be 4-5 characters, which accounts for stop words and potential longer words.

# Average token length for every document.
#   tokenized_word: list of character vectors, one vector of tokens per document.
# Returns a list of the same length holding each document's mean token length
# in characters.
mean_word_length <- function(tokenized_word) {
  lapply(tokenized_word, function(tokens) mean(str_length(tokens)))
}

# Per-document mean word length for each corpus, as three stacked histograms.
par(mfrow = c(3, 1))

tweet_avg_length <- unlist(mean_word_length(clean_tweet))
news_avg_length <- unlist(mean_word_length(clean_news))
blog_avg_length <- unlist(mean_word_length(clean_blog))

hist(tweet_avg_length, main = "Avg word length for Twitter Posts",
     xlab = "Number of characters", col = "#1DA1F2")
hist(news_avg_length, main = "Avg word length for News Articles",
     xlab = "Number of characters", col = "#333333")
hist(blog_avg_length, main = "Avg word length for Blog Posts",
     xlab = "Number of characters", col = "#21759B")

The expectation of 4-5 characters was correct, with news articles and blog posts having only a slightly higher average word length when compared to Twitter posts, which contain a distribution of words shorter than 4 characters. Blogs show a similar pattern, with a higher frequency of words shorter than 5 characters than longer, while news contains a high proportion of words longer than 5 characters.

Investigating stop words

I expect stopwords to be as popular in each of the three posts in terms of percentages, but with higher counts in news and blogs. The types of stopwords used will be potentially different. For example, news may be more general using “we” while twitter and blogs may use “I” or “me”.

# English stop words as a one-column data frame; the `word` column is the join
# key used by the stop-word helpers below.
# NOTE(review): `tm` is attached after `stopwords` and masks its `stopwords()`
# (see the package attach messages above), so this unqualified call resolves to
# tm's version; confirm that is the intended word list.
stop_words <- data.frame(word=stopwords("en"), stringsAsFactors=FALSE)

# Total number of stop-word tokens in a tokenized corpus.
#   tokenized_list: list of character vectors, one vector of tokens per document.
# Returns a single integer: how many tokens match the global `stop_words` table.
count_total_stopwords <- function(tokenized_list) {
  # Flatten all documents into one long `word` column.
  all_tokens <- data.frame(word = unlist(tokenized_list), stringsAsFactors = FALSE)

  # Only tokens present in `stop_words` survive the join; each surviving row is
  # one stop-word occurrence.
  stopword_rows <- inner_join(all_tokens, stop_words, by = "word")

  # tally() yields a one-row table whose `n` column is the row count.
  tally(stopword_rows)$n
}

# Stop-word totals, one per corpus
twitter_count <- count_total_stopwords(clean_tweet)
news_count <- count_total_stopwords(clean_news)
blog_count <- count_total_stopwords(clean_blog)

# Gather the totals into one table (one row per corpus) for plotting
dataset_labels <- c("Twitter", "News", "Blog")
stopword_totals <- c(twitter_count, news_count, blog_count)
stopword_summary <- data.frame(
  dataset = dataset_labels,
  total_stopwords = stopword_totals
)

# Bar chart comparing the stop-word totals: one bar per corpus, coloured by
# corpus. The legend is hidden because the x axis already names each bar.
ggplot(
  data = stopword_summary,
  mapping = aes(x = dataset, y = total_stopwords, fill = dataset)
) +
  geom_col() +
  labs(title = "Comparison of Total Stop Words Across Datasets",
       x = "Dataset",
       y = "Total Stop Words") +
  theme_minimal() +
  theme(legend.position = "none")

Blogs have more stopwords than news, with a total of 16k compared with around 12.5k for news, and significantly more than Twitter, which has around 6k. This is interesting, as blogs are typically longer than news articles; however, it makes me question the writing style and quality, and whether the difference in stopwords is proportional to the respective post length.

Distribution of stop words

# Frequency table of the stop words found in a tokenized corpus.
#   tokenized_list: list of character vectors, one vector of tokens per document.
# Returns a data frame with columns `word` and `n`, most frequent first.
count_stopwords <- function(tokenized_list) {
  # Flatten all documents into one long `word` column.
  all_tokens <- data.frame(word = unlist(tokenized_list), stringsAsFactors = FALSE)

  # Keep only tokens listed in the global `stop_words` table, then count them.
  stopword_rows <- inner_join(all_tokens, stop_words, by = "word")
  count(stopword_rows, word, sort = TRUE)
}

# Stop-word frequency table for each corpus.
twitter_stopwords <- count_stopwords(clean_tweet)
news_stopwords <- count_stopwords(clean_news)
blog_stopwords <- count_stopwords(clean_blog)

# Horizontal bar chart of the most frequent words in a word/count table.
#   word_counts: data frame with columns `word` and `n`.
#   bar_fill:    fill colour for the bars.
#   plot_title:  title shown above the chart.
#   n_words:     how many of the most frequent words to show (ties at the
#                cut-off are all kept, so slightly more rows can appear).
# Returns the ggplot object; calling it at top level draws the chart.
plot_top_stopwords <- function(word_counts, bar_fill, plot_title, n_words = 10) {
  word_counts %>%
    slice_max(n, n = n_words) %>%
    # reorder() sorts the bars by count; coord_flip() then lays them out
    # horizontally so the words are readable.
    ggplot(aes(x = reorder(word, n), y = n)) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = plot_title, x = "Stop Words", y = "Count") +
    theme_minimal()
}

# par(mfrow = ...) is not set here on purpose: it only arranges base-graphics
# plots and has no effect on ggplot2 charts, which are each drawn on their own
# page.
plot_top_stopwords(twitter_stopwords, twitter_colour,
                   "Top 10 Most Frequent Stopwords in Twitter Dataset")

plot_top_stopwords(news_stopwords, news_colour,
                   "Top 10 Most Frequent Stopwords in news Dataset")

plot_top_stopwords(blog_stopwords, blog_colour,
                   "Top 10 Most Frequent Stopwords in Blog Dataset")

The three different groups have the same number one stopword – “the” – with some other popular stopwords like “and”, “to” and “a”. Twitter’s third most popular stopword is “I”, and its fifth most popular is “you”. However, for blog “I” is the 7th most popular stopword. Compare this to news, which does not contain instances of “I” or “you” in its top ten most popular stopwords. My expectation was correct, that it is not as prevalent for news articles to use personal language. Having this variation is good for the model, as it provides useful functionality for mobile phone users.

Counting occurrences of unique words

I expect that twitter will have more personal and emotional unique words, whereas news will be more objective and factual. Blog posts will lie somewhere in between.

# Frequency table of the non-stop-words in a tokenized corpus.
#   tokenized_list: list of character vectors, one vector of tokens per document.
# Returns a data frame with columns `word` and `n`, most frequent first.
count_unique_words <- function(tokenized_list) {
  # Flatten all documents into one long `word` column.
  all_tokens <- data.frame(word = unlist(tokenized_list), stringsAsFactors = FALSE)

  # Discard every token listed in the global `stop_words` table, then count
  # what is left.
  content_rows <- anti_join(all_tokens, stop_words, by = "word")
  count(content_rows, word, sort = TRUE)
}

# Non-stop-word frequency table for each corpus
twitter_unique <- count_unique_words(clean_tweet)
news_unique <- count_unique_words(clean_news)
blog_unique <- count_unique_words(clean_blog)

# Average number of distinct non-stop-words across the news and blog samples
mean(c(nrow(news_unique), nrow(blog_unique)))

## [1] 30341.5

# Twitter: ten most frequent non-stop-words as horizontal bars sorted by count
top_twitter_words <- slice_max(twitter_unique, n, n = 10)
ggplot(top_twitter_words, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = twitter_colour) +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Unique Words in Twitter Dataset",
       x = "Unique Words",
       y = "Count") +
  theme_minimal()

# News: same chart for the news sample
top_news_words <- slice_max(news_unique, n, n = 10)
ggplot(top_news_words, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = news_colour) +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Unique Words in News Dataset",
       x = "Unique Words",
       y = "Count") +
  theme_minimal()

# Blog: same chart for the blog sample
top_blog_words <- slice_max(blog_unique, n, n = 10)
ggplot(top_blog_words, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = blog_colour) +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Unique Words in Blog Dataset",
       x = "Unique Words",
       y = "Count") +
  theme_minimal()

The expectation holds true, as the Twitter data-set’s top four words are “love”, “good”, “day” and “rt” (potentially: “retweet”), while the news data-set’s are “year”, “time”, “state” and “years”, with “people” and “back” fifth and sixth respectively. Finally, the blogs data-set contained a mix of both, with “time” as its top word, and “back”, “make” and “people” being second, third, and fourth respectively. While this is more similar to the distribution of words in the news articles, it does contain emotional words such as “good” in fifth and “love” in seventh. This indicates that there is more personal messaging in blogs than news, but not as much as in the Twitter data-set.

Milestone Report

spemurphy

2025-01-03