Milestone Report: Next Word Prediction

Data Loading and Summary Statistics

# Read at most the first 5000 lines of each en_US corpus file from the
# extracted archive directory. Embedded NUL bytes are skipped and the
# resulting strings are marked as UTF-8.
blogs <- readLines(
  con = "final/en_US/en_US.blogs.txt",
  n = 5000,
  encoding = "UTF-8",
  skipNul = TRUE
)

news <- readLines(
  con = "final/en_US/en_US.news.txt",
  n = 5000,
  encoding = "UTF-8",
  skipNul = TRUE
)

twitter <- readLines(
  con = "final/en_US/en_US.twitter.txt",
  n = 5000,
  encoding = "UTF-8",
  skipNul = TRUE
)

# Count whitespace-delimited words across a character vector.
#
# text: character vector (typically one element per line of a corpus).
# Returns a single integer: the total number of runs of non-whitespace
# characters over all elements. Empty, whitespace-only and NA elements
# contribute zero, and a zero-length input yields 0.
word_count <- function(text) {
  matches <- gregexpr("\\S+", text)
  # gregexpr() signals "no match" with a single -1 (NA for NA input), so
  # taking the length of each result would score every empty line as one
  # word. Count only the positive match positions instead.
  per_line <- vapply(
    matches,
    function(pos) sum(pos > 0L, na.rm = TRUE),
    integer(1)
  )
  sum(per_line)
}

# Per-corpus overview: one row per dataset with the number of lines read
# and the total word count returned by word_count().
summary_table <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Line_Count = lengths(list(blogs, news, twitter)),
  Word_Count = vapply(list(blogs, news, twitter), word_count, integer(1))
)

# Print the overview table
summary_table

##   Dataset Line_Count Word_Count
## 1   Blogs       5000     205555
## 2    News       5000     170940
## 3 Twitter       5000      63747

# Fix the RNG seed so the random draws below are reproducible, then take
# 1000 lines without replacement from each corpus (blogs, news, twitter,
# in that order).
set.seed(123)

sample_blogs <- sample(x = blogs, size = 1000)
sample_news <- sample(x = news, size = 1000)
sample_twitter <- sample(x = twitter, size = 1000)

# Histograms of per-line character counts for each sampled corpus.
hist(
  nchar(sample_blogs),
  main = "Blog Line Lengths",
  xlab = "Characters",
  col = "lightblue"
)

hist(
  nchar(sample_news),
  main = "News Line Lengths",
  xlab = "Characters",
  col = "lightgreen"
)

hist(
  nchar(sample_twitter),
  main = "Twitter Line Lengths",
  xlab = "Characters",
  col = "lightpink"
)

Milestone Report: Next Word Prediction

Chinmayi N

2025-12-29

Introduction

Data Loading and Summary Statistics