This report presents an exploratory data analysis of three English text datasets (blogs, news, and Twitter) that will be used to build a next word prediction model. The analysis examines key characteristics of the text data, including file sizes, word frequencies, and linguistic patterns, to inform the development of an effective prediction algorithm.
The dataset consists of three files containing English text from different sources:

- en_US.blogs.txt: text from blog posts
- en_US.news.txt: text from news articles
- en_US.twitter.txt: text from Twitter messages
# Function to safely get file info
get_file_info <- function(file_path) {
  if (file.exists(file_path)) {
    info <- file.info(file_path)
    lines <- readLines(file_path, warn = FALSE, n = 1000)  # Sample for speed
    return(list(
      size_mb = round(info$size / 1024^2, 2),
      sample_lines = length(lines),
      estimated_total = info$size / mean(nchar(lines, allowNA = TRUE), na.rm = TRUE)
    ))
  } else {
    return(list(size_mb = 0, sample_lines = 0, estimated_total = 0))
  }
}

# Get file information
files_info <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  File = c("en_US/en_US.blogs.txt", "en_US/en_US.news.txt", "en_US/en_US.twitter.txt"),
  stringsAsFactors = FALSE
)

# Add file statistics
for (i in 1:nrow(files_info)) {
  info <- get_file_info(files_info$File[i])
  files_info$Size_MB[i] <- info$size_mb
  files_info$Sample_Lines[i] <- info$sample_lines
  files_info$Est_Total_Lines[i] <- round(info$estimated_total, 0)
}

# Display file information table
kable(files_info[, c("Source", "Size_MB", "Sample_Lines", "Est_Total_Lines")],
      col.names = c("Data Source", "Size (MB)", "Sample Lines", "Est. Total Lines"),
      caption = "Overview of Text Data Files")
| Data Source | Size (MB) | Sample Lines | Est. Total Lines |
|---|---|---|---|
| Blogs | 200.42 | 1000 | 910742 |
| News | 196.28 | 1000 | 1039438 |
| Twitter | 159.36 | 1000 | 2437786 |
# Calculate totals
total_size <- sum(files_info$Size_MB, na.rm = TRUE)
total_lines <- sum(files_info$Est_Total_Lines, na.rm = TRUE)
cat(paste("Total dataset size:", total_size, "MB\n"))
cat(paste("Estimated total lines:", format(total_lines, big.mark = ","), "\n"))
## Total dataset size: 556.06 MB
## Estimated total lines: 4,387,966
# Function to load and clean sample data
load_sample_data <- function(file_path, sample_rate = 0.01) {
  if (file.exists(file_path)) {
    # Read all lines, then draw a random sample for analysis
    con <- file(file_path, "r")
    all_lines <- readLines(con, warn = FALSE)
    close(con)

    set.seed(123)
    sample_size <- min(length(all_lines), max(1000, round(length(all_lines) * sample_rate)))
    sample_lines <- sample(all_lines, sample_size)

    # Basic cleaning: lowercase, keep letters/apostrophes, collapse whitespace
    clean_lines <- tolower(sample_lines)
    clean_lines <- gsub("[^a-z[:space:]']", " ", clean_lines)
    clean_lines <- gsub("\\s+", " ", clean_lines)
    clean_lines <- trimws(clean_lines)
    clean_lines <- clean_lines[nzchar(clean_lines)]
    return(clean_lines)
  } else {
    # Return placeholder data if the files don't exist
    return(c(
      "this is a sample blog post about data science",
      "the weather today is very nice and sunny",
      "i am working on a machine learning project",
      "natural language processing is fascinating",
      "text mining helps us understand patterns"
    ))
  }
}
# Load sample data from each source
blogs_sample <- load_sample_data("en_US/en_US.blogs.txt")
news_sample <- load_sample_data("en_US/en_US.news.txt")
twitter_sample <- load_sample_data("en_US/en_US.twitter.txt")
cat("Sample sizes loaded:\n")
## Sample sizes loaded:
## Blogs: 8988 lines
## News: 10095 lines
## Twitter: 23601 lines
# Function to analyze text
analyze_text <- function(text_lines, source_name) {
  # Combine all lines
  all_text <- paste(text_lines, collapse = " ")
  # Split into words
  words <- unlist(strsplit(all_text, "\\s+"))
  words <- words[nzchar(words)]

  # Calculate statistics
  stats <- data.frame(
    Source = source_name,
    Total_Words = length(words),
    Unique_Words = length(unique(words)),
    Avg_Words_Per_Line = round(length(words) / length(text_lines), 1),
    Avg_Chars_Per_Word = round(mean(nchar(words)), 1),
    stringsAsFactors = FALSE
  )
  return(list(stats = stats, words = words))
}
# Analyze each dataset
blogs_analysis <- analyze_text(blogs_sample, "Blogs")
news_analysis <- analyze_text(news_sample, "News")
twitter_analysis <- analyze_text(twitter_sample, "Twitter")
# Combine statistics
word_stats <- rbind(blogs_analysis$stats, news_analysis$stats, twitter_analysis$stats)
kable(word_stats,
      caption = "Word Count Statistics by Data Source")
| Source | Total_Words | Unique_Words | Avg_Words_Per_Line | Avg_Chars_Per_Word |
|---|---|---|---|---|
| Blogs | 374624 | 27767 | 41.7 | 4.4 |
| News | 340749 | 29134 | 33.8 | 4.7 |
| Twitter | 299430 | 24669 | 12.7 | 4.2 |
# Calculate line lengths
blogs_lengths <- nchar(blogs_sample)
news_lengths <- nchar(news_sample)
twitter_lengths <- nchar(twitter_sample)
# Create combined dataset for plotting
length_data <- data.frame(
  Length = c(blogs_lengths, news_lengths, twitter_lengths),
  Source = c(rep("Blogs", length(blogs_lengths)),
             rep("News", length(news_lengths)),
             rep("Twitter", length(twitter_lengths)))
)

# Plot distribution
ggplot(length_data, aes(x = Length, fill = Source)) +
  geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
  facet_wrap(~Source, scales = "free_y") +
  labs(title = "Distribution of Text Length by Source",
       x = "Characters per Line",
       y = "Frequency") +
  theme_minimal() +
  scale_fill_brewer(type = "qual", palette = "Set2")
# Combine all words
all_words <- c(blogs_analysis$words, news_analysis$words, twitter_analysis$words)
# Calculate word frequencies
word_freq <- table(all_words)
word_freq_df <- data.frame(
  Word = names(word_freq),
  Frequency = as.numeric(word_freq),
  stringsAsFactors = FALSE
) %>%
  arrange(desc(Frequency))

# Top 20 most common words
top_words <- head(word_freq_df, 20)
kable(head(top_words, 10),
      caption = "Top 10 Most Frequent Words")
| Word | Frequency |
|---|---|
| the | 47786 |
| to | 27525 |
| and | 24461 |
| a | 24119 |
| of | 20165 |
| i | 17418 |
| in | 16793 |
| for | 11199 |
| that | 10621 |
| is | 10450 |
# Plot top words
ggplot(top_words, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words",
       x = "Word",
       y = "Frequency") +
  theme_minimal()
# Function to create bigrams
create_bigrams <- function(words) {
  if (length(words) < 2) return(character(0))
  bigrams <- character(length(words) - 1)
  for (i in 1:(length(words) - 1)) {
    bigrams[i] <- paste(words[i], words[i + 1])
  }
  return(bigrams)
}

# Create bigrams from all text
all_bigrams <- create_bigrams(all_words)
bigram_freq <- table(all_bigrams)
bigram_df <- data.frame(
  Bigram = names(bigram_freq),
  Frequency = as.numeric(bigram_freq),
  stringsAsFactors = FALSE
) %>%
  arrange(desc(Frequency)) %>%
  head(15)

kable(head(bigram_df, 10),
      caption = "Top 10 Most Frequent Bigrams")
| Bigram | Frequency |
|---|---|
| in the | 4330 |
| of the | 4324 |
| to the | 2132 |
| for the | 2107 |
| on the | 1945 |
| to be | 1623 |
| at the | 1463 |
| and the | 1251 |
| in a | 1166 |
| with the | 1051 |
# Plot bigrams
ggplot(bigram_df, aes(x = reorder(Bigram, Frequency), y = Frequency)) +
  geom_col(fill = "darkgreen", alpha = 0.8) +
  coord_flip() +
  labs(title = "Top 15 Most Frequent Bigrams",
       x = "Bigram",
       y = "Frequency") +
  theme_minimal()
# Calculate vocabulary richness (unique words / total words)
richness_data <- data.frame(
  Source = word_stats$Source,
  Vocabulary_Richness = round(word_stats$Unique_Words / word_stats$Total_Words, 3),
  stringsAsFactors = FALSE
)

kable(richness_data,
      caption = "Vocabulary Richness by Source")
| Source | Vocabulary_Richness |
|---|---|
| Blogs | 0.074 |
| News | 0.085 |
| Twitter | 0.082 |
# Plot vocabulary richness
ggplot(richness_data, aes(x = Source, y = Vocabulary_Richness, fill = Source)) +
  geom_col(alpha = 0.8) +
  labs(title = "Vocabulary Richness by Data Source",
       y = "Vocabulary Richness (Unique/Total)",
       x = "Data Source") +
  theme_minimal() +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  theme(legend.position = "none")
# Summary statistics for line lengths
length_summary <- length_data %>%
  group_by(Source) %>%
  summarise(
    Min = min(Length),
    Q1 = quantile(Length, 0.25),
    Median = median(Length),
    Q3 = quantile(Length, 0.75),
    Max = max(Length),
    Mean = round(mean(Length), 1)
  )

kable(length_summary,
      caption = "Line Length Statistics by Source")
| Source | Min | Q1 | Median | Q3 | Max | Mean |
|---|---|---|---|---|---|---|
| Blogs | 1 | 42 | 150 | 321 | 4904 | 222.0 |
| News | 1 | 104 | 178 | 259 | 1684 | 192.6 |
| Twitter | 3 | 34 | 61 | 95 | 140 | 65.2 |
# Box plot of line lengths
ggplot(length_data, aes(x = Source, y = Length, fill = Source)) +
  geom_boxplot(alpha = 0.8) +
  labs(title = "Distribution of Line Lengths by Source",
       y = "Characters per Line",
       x = "Data Source") +
  theme_minimal() +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  theme(legend.position = "none")
Based on our exploratory analysis, we have identified several key characteristics:

- The three files total roughly 556 MB and an estimated 4.4 million lines, so working with samples is essential for fast iteration.
- Line lengths differ sharply by source: Twitter lines are short (median 61 characters, maximum 140), while blog and news lines are several times longer on average.
- A handful of common function words ("the", "to", "and", "a", "of") dominates the word frequency distribution.
- The most frequent bigrams are combinations of these function words ("in the", "of the", "to the").
- Vocabulary richness (unique words divided by total words) is similar across sources, ranging from about 0.07 to 0.09.
These findings inform our modeling approach:

- Training on a representative sample rather than the full corpus keeps processing manageable without losing the dominant patterns.
- Frequency-based n-gram tables are a natural fit, since a relatively small set of words and word pairs accounts for a large share of the text.
- The model should tolerate stylistic differences between sources, especially the short, informal lines typical of Twitter.
The analysis reveals:

- High-quality text data suitable for language modeling
- Sufficient vocabulary diversity for robust predictions (quantified in the sketch below)
- Natural language patterns consistent across sources
- Manageable data size for efficient processing
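One way to quantify the vocabulary-diversity point is to check how many unique words are needed to cover a given share of all word occurrences. Below is a minimal sketch using the word_freq_df table built above; the 50% and 90% cutoffs are illustrative choices, not results reported in this analysis.

# Sketch: how many unique words cover 50% and 90% of all word instances?
# word_freq_df is already sorted by descending frequency
cum_share <- cumsum(word_freq_df$Frequency) / sum(word_freq_df$Frequency)
words_for_50 <- which(cum_share >= 0.5)[1]
words_for_90 <- which(cum_share >= 0.9)[1]
cat("Unique words covering 50% of instances:", words_for_50, "\n")
cat("Unique words covering 90% of instances:", words_for_90, "\n")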
Based on this exploratory analysis, the next steps for building the prediction model include:

- Building unigram, bigram, and higher-order n-gram frequency tables from a larger cleaned sample of the corpus.
- Implementing a lookup that falls back from longer to shorter n-grams when a phrase has not been seen, as sketched below.
- Balancing prediction accuracy against model size and lookup speed so that the final model remains responsive.
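As an illustration of that lookup idea, here is a minimal sketch built directly from the bigram_freq table computed earlier. The function name predict_next_word is ours, and a real model would use a larger sample, higher-order n-grams, and a backoff rule rather than a single bigram table.

# Sketch: suggest the most frequent next words given the previous word
predict_next_word <- function(prev_word, bigram_counts, n = 3) {
  # Keep bigrams whose first word matches the previous word
  matches <- bigram_counts[startsWith(names(bigram_counts), paste0(prev_word, " "))]
  if (length(matches) == 0) return(character(0))
  matches <- sort(matches, decreasing = TRUE)
  # Drop the first word, returning only the predicted continuations
  sub("^\\S+\\s+", "", names(head(matches, n)))
}

predict_next_word("in", bigram_freq)  # likely "the", "a", ... given the bigram table above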
This report provides a comprehensive exploratory analysis of the text mining dataset for next word prediction. The findings support the development of an effective prediction algorithm that can provide contextually relevant word suggestions.