Introduction

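This report summarizes an exploratory analysis of three English-language text datasets: blog posts (en_US.blogs.txt), news articles (en_US.news.txt), and Twitter messages (en_US.twitter.txt). All statistics below are computed on a random sample of 10,000 lines from each file.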

library(data.table)
library(stringr)
library(ggplot2)
library(gridExtra)
# File names (resolved relative to the current working directory)
blogs_file   <- "en_US.blogs.txt"
news_file    <- "en_US.news.txt"
twitter_file <- "en_US.twitter.txt"

# Number of lines kept from each file; the seed makes the draw reproducible
sample_size <- 10000
set.seed(123)

# Read all lines of `path` and return a random sample of at most `n` of them.
#
# Args:
#   path: path to a text file.
#   n:    maximum number of lines to keep.
#
# Returns:
#   A character vector of min(n, number of lines in the file) sampled lines.
#
# Why the extra care when reading:
#   * The connection is opened in binary mode so a stray control character
#     (e.g. Ctrl-Z, which Windows text mode treats as end-of-file) cannot
#     silently truncate the read.
#   * skipNul = TRUE drops embedded NUL bytes instead of cutting the line off
#     at the first one (the original hid that warning with warn = FALSE).
#   * encoding = "UTF-8" marks the strings as UTF-8 so character counts do not
#     depend on the session locale (assumes the files are UTF-8 -- confirm).
#   * The sample size is capped because sample() raises an error when asked
#     for more items than exist (replace = FALSE).
read_sample <- function(path, n) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  lines <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
  sample(lines, min(n, length(lines)))
}

# NOTE: each file is still read completely; only the retained sample is small.
blogs_sample   <- read_sample(blogs_file, sample_size)
news_sample    <- read_sample(news_file, sample_size)
twitter_sample <- read_sample(twitter_file, sample_size)
# Compute basic size statistics for a character vector of text lines.
#
# Args:
#   lines: character vector, one element per line of text.
#
# Returns:
#   A named list with:
#     num_lines          - number of elements in `lines`
#     avg_words_per_line - mean count of whitespace-delimited words per line
#     avg_chars_per_line - mean number of characters per line
#     max_line_length    - character count of the longest line
get_summary <- function(lines) {
  words_per_line <- str_count(lines, "\\S+")
  chars_per_line <- nchar(lines)

  list(
    num_lines = length(lines),
    avg_words_per_line = mean(words_per_line),
    avg_chars_per_line = mean(chars_per_line),
    max_line_length = max(chars_per_line)
  )
}

# Per-dataset statistics
twitter_summary <- get_summary(twitter_sample)
news_summary    <- get_summary(news_sample)
blogs_summary   <- get_summary(blogs_sample)

# One row per dataset. `field()` collects a single statistic across the three
# summaries in Blogs, News, Twitter order; local() keeps the helpers out of
# the global environment.
summary_table <- local({
  all_stats <- list(blogs_summary, news_summary, twitter_summary)
  field <- function(name) unlist(lapply(all_stats, `[[`, name))

  data.table(
    Dataset   = c("Blogs", "News", "Twitter"),
    Lines     = field("num_lines"),
    Avg_Words = field("avg_words_per_line"),
    Avg_Chars = field("avg_chars_per_line"),
    Max_Line  = field("max_line_length")
  )
})

summary_table
##    Dataset Lines Avg_Words Avg_Chars Max_Line
##     <char> <int>     <num>     <num>    <int>
## 1:   Blogs 10000   41.4435  229.7303     5054
## 2:    News 10000   34.1001  201.5949     1840
## 3: Twitter 10000   12.9029   68.7301      140
# Build a histogram of per-line character counts.
#
# Args:
#   lines: character vector of text lines.
#   title: title shown above the plot.
#
# Returns:
#   A ggplot object: a histogram of nchar(lines) with bins 10 characters wide.
plot_line_lengths <- function(lines, title) {
  length_df <- data.frame(line_lengths = nchar(lines))

  bars <- geom_histogram(binwidth = 10, fill = "skyblue", color = "black")
  axis_text <- labs(
    title = title,
    x = "Line length (characters)",
    y = "Frequency"
  )

  ggplot(length_df, aes(x = line_lengths)) +
    bars +
    theme_minimal() +
    axis_text
}

# One line-length histogram per dataset
p1 <- plot_line_lengths(lines = blogs_sample, title = "Blogs Line Lengths")
p2 <- plot_line_lengths(lines = news_sample, title = "News Line Lengths")
p3 <- plot_line_lengths(lines = twitter_sample, title = "Twitter Line Lengths")

# Draw the three histograms side by side in a single row
grid.arrange(grobs = list(p1, p2, p3), ncol = 3)

# Tokenization
#
# Split text lines into lowercase, whitespace-delimited tokens.
#
# Args:
#   lines: character vector of text lines.
#
# Returns:
#   A character vector of all tokens from all lines, in order. Always a
#   character vector (character(0) for empty input), never NULL.
#
# Splitting on "\\s+" yields an empty string for leading/trailing whitespace
# and for empty lines; those (and NA) are dropped so they are not counted as
# words. Punctuation is NOT stripped: "word," and "word" remain distinct
# tokens because only whitespace is used as a delimiter.
tokenize <- function(lines) {
  tokens <- as.character(unlist(str_split(tolower(lines), "\\s+")))
  tokens[!is.na(tokens) & nzchar(tokens)]
}

# Flatten each sample into a single vector of lowercase tokens
twitter_tokens <- tokenize(lines = twitter_sample)
news_tokens    <- tokenize(lines = news_sample)
blogs_tokens   <- tokenize(lines = blogs_sample)

# Return the most frequent tokens with their counts.
#
# Args:
#   tokens: character vector of tokens.
#   n:      number of rows to return (default 10).
#
# Returns:
#   A data.table with columns `word` and `N` (occurrence count), sorted by
#   descending count, containing at most `n` rows.
top_words <- function(tokens, n = 10) {
  counts <- data.table(word = tokens)[, .N, by = word][order(-N)]
  # head() instead of counts[1:n]: indexing past the last row pads the result
  # with NA rows when there are fewer than n distinct words, and 1:0 expands
  # to c(1, 0), which would return a row for n = 0.
  head(counts, n)
}

# Ten most frequent tokens per dataset
twitter_top10 <- top_words(twitter_tokens, n = 10)
news_top10    <- top_words(news_tokens, n = 10)
blogs_top10   <- top_words(blogs_tokens, n = 10)

blogs_top10
##       word     N
##     <char> <int>
##  1:    the 20378
##  2:    and 12010
##  3:     to 11756
##  4:      a  9731
##  5:     of  9520
##  6:      i  8378
##  7:     in  6499
##  8:   that  4927
##  9:     is  4449
## 10:    for  4002
# Print the most frequent tokens found in the News sample
news_top10
##       word     N
##     <char> <int>
##  1:    the 19137
##  2:     to  8986
##  3:    and  8756
##  4:      a  8561
##  5:     of  7754
##  6:     in  6561
##  7:    for  3440
##  8:   that  3300
##  9:     is  2864
## 10:     on  2585
# Print the most frequent tokens found in the Twitter sample
twitter_top10
##       word     N
##     <char> <int>
##  1:    the  4042
##  2:     to  3257
##  3:      i  3048
##  4:      a  2544
##  5:    you  2054
##  6:    and  1802
##  7:     in  1620
##  8:    for  1587
##  9:     of  1498
## 10:     is  1444

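Observations

- Line length differs sharply by source. In the samples, blog lines are the longest on average (about 41 words / 230 characters, maximum 5,054 characters), news lines are somewhat shorter (about 34 words / 202 characters, maximum 1,840), and Twitter lines are much shorter (about 13 words / 69 characters) and never exceed 140 characters.
- "the" is the most frequent token in all three samples, and every top-10 list consists entirely of common function words.
- First- and second-person pronouns separate the sources: "i" is in the top 10 for Blogs and Twitter, "you" only for Twitter, and neither appears in the News top 10.
- Tokens are split on whitespace only, so punctuation stays attached to words (e.g. "word," and "word" are counted separately); the counts above therefore slightly understate true word frequencies.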

Goals for Prediction Algorithm and Shiny App