library(stopwords)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(ggplot2)
library(tidytext)
library(stringr)
library(textclean)
library(ngram)
library(tokenizers)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sentimentr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(kernlab)
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
##
##
## Attaching package: 'tm'
##
## The following object is masked from 'package:stopwords':
##
## stopwords
library(SnowballC)
# Location of the capstone corpus, distributed as a single ZIP archive
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Create a temporary file to store the downloaded ZIP archive
temp_zip <- tempfile()
# download.file() gives up after getOption("timeout") seconds (60 by default),
# which is easily exceeded by a large archive. Raise the limit for this
# download only and restore the previous setting even if the download fails.
old_options <- options(timeout = max(3600, getOption("timeout")))
tryCatch(
  # mode = "wb" keeps the binary archive intact on Windows
  download.file(url, temp_zip, mode = "wb"),
  finally = options(old_options)
)
# List the contents of the ZIP file
file_list <- unzip(temp_zip, list = TRUE)
print(file_list)
## Name Length Date
## 1 final/ 0 2014-07-22 10:10:00
## 2 final/de_DE/ 0 2014-07-22 10:10:00
## 3 final/de_DE/de_DE.twitter.txt 75578341 2014-07-22 10:11:00
## 4 final/de_DE/de_DE.blogs.txt 85459666 2014-07-22 10:11:00
## 5 final/de_DE/de_DE.news.txt 95591959 2014-07-22 10:11:00
## 6 final/ru_RU/ 0 2014-07-22 10:10:00
## 7 final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
## 8 final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
## 9 final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
## 10 final/en_US/ 0 2014-07-22 10:10:00
## 11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
## 12 final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
## 13 final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
## 14 final/fi_FI/ 0 2014-07-22 10:10:00
## 15 final/fi_FI/fi_FI.news.txt 94234350 2014-07-22 10:11:00
## 16 final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
## 17 final/fi_FI/fi_FI.twitter.txt 25331142 2014-07-22 10:10:00
# Look up an archive entry by its file name instead of by its position in the
# listing, so the script does not silently read the wrong corpus if the order
# of entries in the ZIP differs.
find_archive_entry <- function(file_name) {
  entry <- file_list$Name[basename(file_list$Name) == file_name]
  stopifnot(length(entry) == 1L)
  entry
}
twitter_file <- find_archive_entry("en_US.twitter.txt")
news_file <- find_archive_entry("en_US.news.txt")
blog_file <- find_archive_entry("en_US.blogs.txt")
# Connections that read each corpus straight out of the ZIP archive
twitter_con <- unz(temp_zip, twitter_file)
news_con <- unz(temp_zip, news_file)
blog_con <- unz(temp_zip, blog_file)
# Partitioning the data
# Fixed seed so the train/validation/test split and the samples are reproducible
set.seed(54321)
# reading lines
twitter_lines <- readLines(twitter_con, skipNul = TRUE)
news_lines <- readLines(news_con, skipNul = TRUE)
blog_lines <- readLines(blog_con, skipNul = TRUE)
# Splitting the data: 70% training; the remaining 30% is halved into
# validation and test sets.
# Twitter
twitter_indices <- sample(seq_along(twitter_lines), size = 0.7 * length(twitter_lines))
twitter_remaining_indices <- setdiff(seq_along(twitter_lines), twitter_indices)
twitter_validation_indices <- sample(twitter_remaining_indices, size = 0.5 * length(twitter_remaining_indices))
twitter_test_indices <- setdiff(twitter_remaining_indices, twitter_validation_indices)
# Split the data
twitter_train <- twitter_lines[twitter_indices]
twitter_valid <- twitter_lines[twitter_validation_indices]
twitter_test <- twitter_lines[twitter_test_indices]
# News
news_indices <- sample(seq_along(news_lines), size = 0.7 * length(news_lines))
news_remaining_indices <- setdiff(seq_along(news_lines), news_indices)
news_validation_indices <- sample(news_remaining_indices, size = 0.5 * length(news_remaining_indices))
news_test_indices <- setdiff(news_remaining_indices, news_validation_indices)
# Split the data
news_train <- news_lines[news_indices]
news_valid <- news_lines[news_validation_indices]
news_test <- news_lines[news_test_indices]
# Blog
blog_indices <- sample(seq_along(blog_lines), size = 0.7 * length(blog_lines))
blog_remaining_indices <- setdiff(seq_along(blog_lines), blog_indices)
blog_validation_indices <- sample(blog_remaining_indices, size = 0.5 * length(blog_remaining_indices))
blog_test_indices <- setdiff(blog_remaining_indices, blog_validation_indices)
# Split the data
blog_train <- blog_lines[blog_indices]
blog_valid <- blog_lines[blog_validation_indices]
blog_test <- blog_lines[blog_test_indices]
# Sample the training data for analysis purposes
# sampling lines
sample_chunk <- 10000
twitterSamp <- sample(twitter_train, size = sample_chunk, replace = FALSE)
newsSamp <- sample(news_train, size = sample_chunk, replace = FALSE)
blogSamp <- sample(blog_train, size = sample_chunk, replace = FALSE)
# Release the connections and delete the downloaded archive
close(blog_con)
close(twitter_con)
close(news_con)
unlink(temp_zip)
# Fill colours used for each corpus in the plots below
twitter_colour <- "#1DA1F2"
news_colour <- "#333333"
blog_colour <- "#21759B"
I expect Twitter to contain more profanity than news articles or blogs, since it is a personal, informal platform.
# Merge several profanity lexicons from the `lexicon` package into a single
# de-duplicated vector; each lexicon is listed exactly once.
# NOTE(review): `lexicon::profanity_racist` is not part of this list; confirm
# whether leaving it out is intentional.
profanity_list <- unique(c(
  lexicon::profanity_alvarez,
  lexicon::profanity_arr_bad,
  lexicon::profanity_banned,
  lexicon::profanity_zac_anger
))
# Score each sample once and reuse the result: the counts and the rates below
# are read from the same table, so there is no need to run profanity() twice.
twitter_profanity <- profanity(twitterSamp, profanity_list = profanity_list)
news_profanity <- profanity(newsSamp, profanity_list = profanity_list)
blog_profanity <- profanity(blogSamp, profanity_list = profanity_list)
# Total number of profane terms found in each sample
sum(twitter_profanity$profanity_count)
## [1] 1439
sum(news_profanity$profanity_count)
## [1] 2182
sum(blog_profanity$profanity_count)
## [1] 2536
# Summed `profanity` rate divided by the number of posts in each sample.
# NOTE(review): sentimentr appears to report `profanity` per sentence, not per
# post, so the denominator may not match the number of rows being summed;
# sum(profanity_count) / sum(word_count) may be the intended rate - confirm.
sum(twitter_profanity$profanity) / length(twitterSamp)
## [1] 0.02335812
sum(news_profanity$profanity) / length(newsSamp)
## [1] 0.01204824
sum(blog_profanity$profanity) / length(blogSamp)
## [1] 0.01753527
While news and blog posts had a higher count of profanity, their proportion of profanity is smaller than Twitter's. This is likely explained by the greater length of news articles and blog posts.
Now I will first tokenize and transform the case of these texts to lower before removing profanity.
# Save tokenized version of each text (one character vector of tokens per post)
twitter_token <- tokenize_words(twitterSamp)
news_token <- tokenize_words(newsSamp)
blog_token <- tokenize_words(blogSamp)
# Cleaning the lists of profanity
# Filter with %in% rather than setdiff(): setdiff() returns a set, so it would
# also collapse every repeated word in a post and distort the word, stopword
# and length statistics computed from these lists.
drop_profanity <- function(words) {
  words[!words %in% profanity_list]
}
clean_tweet <- lapply(twitter_token, drop_profanity)
clean_news <- lapply(news_token, drop_profanity)
clean_blog <- lapply(blog_token, drop_profanity)
The texts have been tokenized and profanity has been removed. Now, some preliminary analysis will be done on the distribution of words and word frequencies, while also carrying out lemmatization, stopword removal and correction of misspelled words.
Expectations: 1. News articles and blog posts should have more characters than twitter posts due to a word limit per post on twitter’s site. 2. There may be more variation in the former types of text. However, I’m unsure how much variation is too much. I think this will involve a deeper dive into the words and characters themselves.
# Characters per post, counted as the summed length of the post's tokens.
# Calling nchar() directly on a list of token vectors would instead measure
# the deparsed text of each vector (quotes, commas and `c(...)` included).
count_post_characters <- function(tokenized_list) {
  vapply(tokenized_list, function(words) sum(nchar(words)), integer(1))
}
# Stack the three histograms in one column
par(mfrow = c(3, 1))
hist(count_post_characters(clean_tweet), main = "Character distribution for Twitter Posts", xlab = "Number of Characters", col = twitter_colour)
hist(count_post_characters(clean_news), main = "Character distribution for News Articles", xlab = "Number of Characters", col = news_colour)
hist(count_post_characters(clean_blog), main = "Character distribution for Blog Posts", xlab = "Number of Characters", col = blog_colour)
Twitter has an average of 50 characters with most between 25-100, with a range from 0 to 220, while news has an average of around 250-300 characters, with a range from 0 to 1500 in extreme cases. Finally, blog has an average between 0-500, with a range from 0 to 3500 in extreme cases. In conclusion, it is correct that news and blog posts have a greater number of characters per post.
Expectations 1. I expect that again blog posts and news articles will have more words per sentence than twitter posts. 2. I expect there to be 3x to 5x more words in a news article and blog than a twitter post.
# Words per post: lengths() returns the token count of every list element as
# an integer vector, without the input-dependent return type of sapply().
# Stack the three histograms in one column
par(mfrow = c(3, 1))
hist(lengths(clean_tweet), main = "Word distribution for Twitter Posts", xlab = "Number of Words", col = twitter_colour)
hist(lengths(clean_news), main = "Word distribution for News Articles", xlab = "Number of Words", col = news_colour)
hist(lengths(clean_blog), main = "Word distribution for Blog posts", xlab = "Number of Words", col = blog_colour)
It is correct that, on average, news articles and blog posts have more words than Twitter posts. Interestingly, the distributions are similar to the respective distributions for the number of characters. This may indicate that, while news and blog posts are longer, the words they use are similar in length.
Expectations 1. Average word length will be 4-5 characters, which accounts for stop words and potential longer words.
# Average token length of every post.
#
# tokenized_word: list with one character vector of tokens per post.
# Returns a list of the same length holding the mean number of characters per
# token for each post; callers unlist() it before plotting.
mean_word_length <- function(tokenized_word) {
  lapply(tokenized_word, function(tokens) mean(str_length(tokens)))
}
# Flatten the per-post averages into numeric vectors before plotting
tweet_word_means <- unlist(mean_word_length(clean_tweet))
news_word_means <- unlist(mean_word_length(clean_news))
blog_word_means <- unlist(mean_word_length(clean_blog))
# Stack the three histograms in one column
par(mfrow = c(3, 1))
hist(tweet_word_means, main = "Avg word length for Twitter Posts", xlab = "Number of characters", col = "#1DA1F2")
hist(news_word_means, main = "Avg word length for News Articles", xlab = "Number of characters", col = "#333333")
hist(blog_word_means, main = "Avg word length for Blog Posts", xlab = "Number of characters", col = "#21759B")
The expectation of 4-5 characters was correct, with news articles and blog posts having only a slightly higher average word length than Twitter posts, whose distribution includes many posts averaging fewer than 4 characters per word. Blogs show a similar pattern, with averages below 5 characters occurring more often than averages above 5, while news has a high share of posts whose average word length is greater than 5 characters.
# English stop-word list as a one-column data frame, used as the join table by
# the counting helpers. Both `stopwords` and `tm` export a stopwords()
# function and `tm` is attached last, so the call is qualified to make the
# source of the list explicit rather than dependent on attach order.
# NOTE(review): this assignment also shadows the `stop_words` dataset that
# tidytext provides; confirm that is intended.
stop_words <- data.frame(word = tm::stopwords("en"), stringsAsFactors = FALSE)
# Stopwords count for each dataset
#
# tokenized_list: list with one character vector of tokens per post.
# stop_word_df:   data frame with a `word` column of stop words; defaults to
#                 the script-level `stop_words` table.
# Returns the total number of tokens that are stop words, as a single integer.
# Every token is counted exactly once, and an empty list yields 0.
count_total_stopwords <- function(tokenized_list, stop_word_df = stop_words) {
  tokens <- unlist(tokenized_list, use.names = FALSE)
  sum(tokens %in% stop_word_df[["word"]])
}
# Total stop-word count per sample
twitter_count <- count_total_stopwords(clean_tweet)
news_count <- count_total_stopwords(clean_news)
blog_count <- count_total_stopwords(clean_blog)
# One row per dataset, ready for plotting
stopword_summary <- data.frame(
  dataset = c("Twitter", "News", "Blog"),
  total_stopwords = c(twitter_count, news_count, blog_count)
)
# Bar chart comparing the totals; the fill repeats the x axis, so the legend
# is dropped.
stopword_plot <- ggplot(
  data = stopword_summary,
  mapping = aes(x = dataset, y = total_stopwords, fill = dataset)
)
stopword_plot <- stopword_plot +
  geom_col() +
  labs(
    title = "Comparison of Total Stop Words Across Datasets",
    x = "Dataset",
    y = "Total Stop Words"
  )
stopword_plot +
  theme_minimal() +
  theme(legend.position = "none")
Blogs have more stopwords than news, with a total of 16k compared with around 12.5k for news, and significantly more than Twitter, which has around 6k. This is interesting, as blogs are typically longer than news articles; however, it makes me question the writing style and quality, and whether the difference in stopwords is proportional to the respective post length.
# Frequency table of the stop words found in a tokenized sample.
#
# tokenized_list: list with one character vector of tokens per post.
# Returns a data frame with columns `word` and `n`, most frequent first,
# restricted to tokens that appear in the script-level `stop_words` table.
count_stopwords <- function(tokenized_list) {
  # Flatten all posts into a single column of tokens
  all_tokens <- data.frame(word = unlist(tokenized_list), stringsAsFactors = FALSE)
  # Keep only the tokens that are stop words, then tally them
  stopword_tokens <- inner_join(all_tokens, stop_words, by = "word")
  count(stopword_tokens, word, sort = TRUE)
}
# Stop-word frequency tables for each sample
twitter_stopwords <- count_stopwords(clean_tweet)
news_stopwords <- count_stopwords(clean_news)
blog_stopwords <- count_stopwords(clean_blog)

# Horizontal bar chart of the most frequent words in a frequency table.
#
# word_counts: data frame with columns `word` and `n`.
# bar_fill:    fill colour of the bars.
# plot_title:  chart title.
# axis_label:  label of the word axis.
# n_top:       how many of the most frequent words to show (ties are kept).
# Returns the ggplot object; calling the helper at top level draws the chart.
# Each chart is drawn on its own page: par(mfrow = ...) only affects base
# graphics, so it is not used here.
plot_top_words <- function(word_counts, bar_fill, plot_title, axis_label, n_top = 10) {
  word_counts %>%
    slice_max(n, n = n_top) %>%
    # Order the bars by count so the most frequent word ends up on top
    ggplot(aes(x = reorder(word, n), y = n)) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = plot_title, x = axis_label, y = "Count") +
    theme_minimal()
}

# Twitter: Visualize with a bar chart
plot_top_words(
  twitter_stopwords, twitter_colour,
  "Top 10 Most Frequent Stopwords in Twitter Dataset", "Stop Words"
)
# News: Visualize with a bar chart
plot_top_words(
  news_stopwords, news_colour,
  "Top 10 Most Frequent Stopwords in news Dataset", "Stop Words"
)
# Blog: Visualize with a bar chart
plot_top_words(
  blog_stopwords, blog_colour,
  "Top 10 Most Frequent Stopwords in Blog Dataset", "Stop Words"
)
The three different groups have the same number one stopword – “the” – with some other popular stopwords like “and”, “to” and “a”. Twitter’s third most popular stopword is “I”, and its fifth most popular is “you”. However, for blog “I” is the 7th most popular stopword. Compare this to news, which does not contain instances of “I” or “you” in its top ten most popular stopwords. My expectation was correct, that it is not as prevalent for news articles to use personal language. Having this variation is good for the model, as it provides useful functionality for mobile phone users.
# Frequency table of the non-stop-word tokens in a tokenized sample.
#
# tokenized_list: list with one character vector of tokens per post.
# Returns a data frame with columns `word` and `n`, most frequent first,
# covering every token that is absent from the script-level `stop_words` table.
count_unique_words <- function(tokenized_list) {
  # Flatten all posts into a single column of tokens
  all_tokens <- data.frame(word = unlist(tokenized_list), stringsAsFactors = FALSE)
  # Discard the stop words, then tally what is left
  content_tokens <- anti_join(all_tokens, stop_words, by = "word")
  count(content_tokens, word, sort = TRUE)
}
# Frequency tables of the non-stop-word tokens in each sample
twitter_unique <- count_unique_words(clean_tweet)
news_unique <- count_unique_words(clean_news)
blog_unique <- count_unique_words(clean_blog)
# Mean number of distinct non-stop-word tokens in the news and blog samples
mean(c(nrow(news_unique), nrow(blog_unique)))
## [1] 30341.5
# Twitter: ten most frequent non-stop-word tokens as horizontal bars
twitter_top_unique <- slice_max(twitter_unique, n, n = 10)
ggplot(twitter_top_unique, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = twitter_colour) +
  coord_flip() +
  labs(
    title = "Top 10 Most Frequent Unique Words in Twitter Dataset",
    x = "Unique Words",
    y = "Count"
  ) +
  theme_minimal()
# News: ten most frequent non-stop-word tokens as horizontal bars
news_top_unique <- slice_max(news_unique, n, n = 10)
ggplot(news_top_unique, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = news_colour) +
  coord_flip() +
  labs(
    title = "Top 10 Most Frequent Unique Words in News Dataset",
    x = "Unique Words",
    y = "Count"
  ) +
  theme_minimal()
# Blog: ten most frequent non-stop-word tokens as horizontal bars
blog_top_unique <- slice_max(blog_unique, n, n = 10)
ggplot(blog_top_unique, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = blog_colour) +
  coord_flip() +
  labs(
    title = "Top 10 Most Frequent Unique Words in Blog Dataset",
    x = "Unique Words",
    y = "Count"
  ) +
  theme_minimal()
The expectation holds true, as the twitter data-sets top four words are “love”, “good”, “day” and “rt” (potentially: “retweet”), while the news data-set is “year”, “time”, “state” and “years”, with “people” and “back” fifth and sixth respectively. Finally, the blogs data-set contained a mix of both, with “time” as its top word, with “back”, “make” and “people” being second, third, and fourth respectively. While this is more similar to the distribution of words in the news articles, it does contain emotional words such as “good” in fifth and “love” in seventh. This indicates that there is more personal messaging in blogs than news, but not as much as in the twitter data-set.