---
title: "Milestone Report: Exploratory Text Data Analysis"
author: "Taiki Matsugi"
date: "2025-04-02"
output:
  html_document:
    toc: true
    toc_float: true
    theme: united
---
## Introduction
This report presents an exploratory analysis of English text data collected from three different sources (blogs, news, and Twitter). This analysis serves as a foundation for developing a text prediction algorithm.
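
To make that goal concrete before diving into the data, here is a minimal, illustrative sketch (toy input only, not the project data) of the kind of bigram frequency table a simple next-word predictor is built on:

``` r
library(tidyverse)

# Toy example only: count bigrams (adjacent word pairs) in a tiny corpus
toy_lines <- c("the cat sat", "the cat ran")
words <- str_split(tolower(toy_lines), "\\s+")
bigrams <- unlist(lapply(words, function(w) paste(head(w, -1), tail(w, -1))))
sort(table(bigrams), decreasing = TRUE)
# "the cat" occurs twice; "cat sat" and "cat ran" once each
```

Given a prefix word, the most frequent bigram starting with it yields the predicted next word; the statistics below indicate how large and varied the tables built from the real corpora will be.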
## Data Loading and Basic Statistics
``` r
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Set data file paths
blogs_file <- "./data/en_US/en_US.blogs.txt"
news_file <- "./data/en_US/en_US.news.txt"
twitter_file <- "./data/en_US/en_US.twitter.txt"
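# Optional sketch (assumes the files exist at the paths above):
# raw file sizes in megabytes, for context on how large the full corpora are
round(file.size(c(blogs_file, news_file, twitter_file)) / 1024^2, 1)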
# Define a safe_readLines function to handle potential errors
safe_readLines <- function(file_path) {
  tryCatch({
    readLines(file_path, encoding = "UTF-8", skipNul = TRUE)
  }, error = function(e) {
    warning(paste("Error reading file:", file_path, "\n", e$message))
    NULL  # Return NULL if there's an error
  })
}
# Load data (with sampling)
set.seed(123) # For reproducibility
sample_size <- 10000 # Sampling to manage memory usage
blogs_text <- safe_readLines(blogs_file)
news_text <- safe_readLines(news_file)
twitter_text <- safe_readLines(twitter_file)
# Sampling
if (!is.null(blogs_text)) blogs_text <- sample(blogs_text, min(length(blogs_text), sample_size))
if (!is.null(news_text)) news_text <- sample(news_text, min(length(news_text), sample_size))
if (!is.null(twitter_text)) twitter_text <- sample(twitter_text, min(length(twitter_text), sample_size))
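# Illustrative alternative (not run): for files too large to read at once,
# lines could instead be kept with a fixed probability while streaming:
#   con <- file(blogs_file, "r")
#   keep <- character(0)
#   while (length(line <- readLines(con, n = 1)) > 0) {
#     if (rbinom(1, 1, 0.01) == 1) keep <- c(keep, line)
#   }
#   close(con)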
# Function to calculate basic statistics for each source
calculate_stats <- function(text, source_name) {
  if (is.null(text)) {
    return(data.frame(
      Source = source_name,
      Lines = 0,
      TotalWords = 0,
      UniqueWords = 0,
      AvgWordsPerLine = 0,
      MaxWordsPerLine = 0
    ))
  }
  words <- unlist(strsplit(tolower(paste(text, collapse = " ")), "\\s+"))
  data.frame(
    Source = source_name,
    Lines = length(text),
    TotalWords = length(words),
    UniqueWords = length(unique(words)),
    AvgWordsPerLine = mean(str_count(text, "\\S+")),
    MaxWordsPerLine = max(str_count(text, "\\S+"))
  )
}
# Calculate statistics
stats_df <- rbind(
  calculate_stats(blogs_text, "Blogs"),
  calculate_stats(news_text, "News"),
  calculate_stats(twitter_text, "Twitter")
)
# Display results
knitr::kable(stats_df,
             caption = "Basic Statistics by Data Source",
             digits = 2,
             format.args = list(big.mark = ","))
```
| Source  |  Lines | TotalWords | UniqueWords | AvgWordsPerLine | MaxWordsPerLine |
|:--------|-------:|-----------:|------------:|----------------:|----------------:|
| Blogs   | 10,000 |    414,435 |      52,851 |           41.44 |             808 |
| News    | 10,000 |    341,001 |      50,539 |           34.10 |             265 |
| Twitter | 10,000 |    129,029 |      24,943 |           12.90 |              35 |

Table: Basic Statistics by Data Source

Twitter lines are markedly shorter on average (about 13 words) than blog lines (about 41), reflecting the platform's character limit, while blogs show by far the longest outliers.

``` r
# Build a data frame of "sentence lengths" (words per line; each line is one
# blog post, news article, or tweet in these corpora)
get_sentence_lengths <- function(text, source) {
  if (is.null(text)) return(data.frame(length = numeric(0), source = character(0)))
  data.frame(
    length = str_count(text, "\\S+"),
    source = source
  )
}
# Combine sentence lengths from all sources
sentence_lengths <- rbind(
  get_sentence_lengths(blogs_text, "Blogs"),
  get_sentence_lengths(news_text, "News"),
  get_sentence_lengths(twitter_text, "Twitter")
)
# Create histogram
if (nrow(sentence_lengths) > 0) {
  print(ggplot(sentence_lengths, aes(x = length, fill = source)) +
    geom_histogram(binwidth = 5, position = "dodge", alpha = 0.7) +
    facet_wrap(~source, scales = "free_y") +
    labs(title = "Distribution of Sentence Lengths by Source",
         x = "Number of Words",
         y = "Frequency") +
    theme_minimal() +
    scale_fill_brewer(palette = "Set2") +
    theme(
      axis.text = element_text(size = 10),
      axis.title = element_text(size = 12),
      plot.title = element_text(size = 14, hjust = 0.5)
    ))
}
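# Complementary numeric view (sketch): quartiles of words per line by source
if (nrow(sentence_lengths) > 0) {
  print(tapply(sentence_lengths$length, sentence_lengths$source, summary))
}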
# Function to create word clouds
create_wordcloud <- function(text, title) {
  if (is.null(text) || length(text) == 0) {
    plot.new()
    title(main = paste(title, "(No Data)"))
    return()
  }
  # Create and preprocess corpus
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)  # Remove extra whitespace
  # Calculate word frequencies
  tdm <- TermDocumentMatrix(corpus)
  word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  # Create word cloud; wordcloud() has no `main` argument (it would be
  # silently ignored), so draw the panel title with title() afterwards
  wordcloud(words = names(word_freq),
            freq = word_freq,
            max.words = 100,
            random.order = FALSE,
            colors = brewer.pal(8, "Dark2"),
            scale = c(3, 0.5))  # Adjust scale for better readability
  title(main = title)
}
# Display three word clouds in one figure
# (each tm_map() call on the default SimpleCorpus emits a benign
# "transformation drops documents" warning; it is shown once below and
# repeats identically for the News and Twitter calls)
par(mfrow = c(1, 3), mar = c(2, 2, 2, 2))
create_wordcloud(blogs_text, "Blogs")
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
create_wordcloud(news_text, "News")
create_wordcloud(twitter_text, "Twitter")
```