Introduction

This project aims to build a text prediction system that predicts the next word based on user input. This milestone report demonstrates that the dataset has been successfully loaded and explored.

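The dataset consists of US English text collected from blogs, news articles, and Twitter. To keep this exploration fast, only the first 5,000 lines of each source file are read, so the counts below describe that subset rather than the full corpus.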


Data Loading and Summary Statistics

# Read the first 5000 lines of each corpus file from the extracted ZIP
# directory. Embedded NUL bytes are skipped and the strings are marked UTF-8.
blogs <- readLines(
  con = "final/en_US/en_US.blogs.txt",
  n = 5000,
  encoding = "UTF-8",
  skipNul = TRUE
)

news <- readLines(
  con = "final/en_US/en_US.news.txt",
  n = 5000,
  encoding = "UTF-8",
  skipNul = TRUE
)

twitter <- readLines(
  con = "final/en_US/en_US.twitter.txt",
  n = 5000,
  encoding = "UTF-8",
  skipNul = TRUE
)

# Count whitespace-delimited words across a character vector.
#
# text: character vector, typically one element per line of input.
# Returns a single integer: the total number of runs of non-whitespace
# characters over all elements. Empty, whitespace-only, and NA elements
# contribute 0, and a zero-length input yields 0.
word_count <- function(text) {
  hits <- gregexpr("\\S+", text)
  # gregexpr() reports "no match" as a single -1 (and NA input as NA), so
  # taking length() of each result would count those placeholders as a word.
  # Counting only positive start positions gives 0 for such elements.
  per_line <- vapply(
    hits,
    function(pos) sum(pos > 0L, na.rm = TRUE),
    integer(1)
  )
  sum(per_line)
}

# Build one row per corpus with its line total and word total.
# local() keeps the scratch list out of the global environment.
summary_table <- local({
  corpora <- list(blogs, news, twitter)
  data.frame(
    Dataset = c("Blogs", "News", "Twitter"),
    Line_Count = lengths(corpora),
    Word_Count = unlist(lapply(corpora, word_count))
  )
})

# Print the table
summary_table
##   Dataset Line_Count Word_Count
## 1   Blogs       5000     205555
## 2    News       5000     170940
## 3 Twitter       5000      63747
# Fix the RNG seed so the same 1000-line samples are drawn on every run.
set.seed(123)

# Draw 1000 lines without replacement from each corpus. The draws happen in
# the order blogs, news, twitter, so the random stream is consumed identically.
sample_blogs <- blogs[sample.int(length(blogs), 1000)]
sample_news <- news[sample.int(length(news), 1000)]
sample_twitter <- twitter[sample.int(length(twitter), 1000)]

# Histograms of characters per line for each sampled corpus.
hist(
  x = nchar(sample_blogs),
  main = "Blog Line Lengths",
  xlab = "Characters",
  col = "lightblue"
)

hist(
  x = nchar(sample_news),
  main = "News Line Lengths",
  xlab = "Characters",
  col = "lightgreen"
)

hist(
  x = nchar(sample_twitter),
  main = "Twitter Line Lengths",
  xlab = "Characters",
  col = "lightpink"
)