This project aims to build a text prediction application that suggests the next word as the user types. This milestone report shows that the dataset has been loaded successfully and summarizes an initial exploratory analysis.
The dataset consists of English text collected from blogs, news articles, and Twitter (files under final/en_US/).
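Before reading any text, it can be useful to confirm that the extracted files are present and to check their size on disk. The sketch below assumes the same extracted paths used in the loading code that follows; sizes are reported in megabytes.

# Confirm the extracted files exist and report their sizes in megabytes
# (paths assume the ZIP was extracted into final/en_US/)
files <- c("final/en_US/en_US.blogs.txt",
           "final/en_US/en_US.news.txt",
           "final/en_US/en_US.twitter.txt")
data.frame(
  File    = basename(files),
  Exists  = file.exists(files),
  Size_MB = round(file.size(files) / 1024^2, 1)
)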
# Load the data (using the extracted ZIP path); only the first 5,000 lines
# of each file are read here to keep the report quick to render
blogs   <- readLines("final/en_US/en_US.blogs.txt",
                     encoding = "UTF-8", skipNul = TRUE, n = 5000)
news    <- readLines("final/en_US/en_US.news.txt",
                     encoding = "UTF-8", skipNul = TRUE, n = 5000)
twitter <- readLines("final/en_US/en_US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE, n = 5000)
# Count the words in a character vector: one regex match per
# whitespace-delimited token; empty lines contribute zero words
word_count <- function(text) {
  sum(sapply(gregexpr("\\S+", text), function(m) sum(m > 0)))
}
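As a quick sanity check, the counter can be applied to a small illustrative character vector: two lines containing three and two words plus one empty line should give five words in total.

# Example: 3 + 2 + 0 words across three lines
word_count(c("this is fine", "two words", ""))
## [1] 5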
# Summary table of line and word counts for each source
summary_table <- data.frame(
  Dataset    = c("Blogs", "News", "Twitter"),
  Line_Count = c(length(blogs), length(news), length(twitter)),
  Word_Count = c(
    word_count(blogs),
    word_count(news),
    word_count(twitter)
  )
)
# Display summary
summary_table
## Dataset Line_Count Word_Count
## 1 Blogs 5000 205555
## 2 News 5000 170940
## 3 Twitter 5000 63747
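Dividing the word counts by the line counts gives the average words per line, which already hints at the difference between the sources in this sample: roughly 41 for blogs, 34 for news, and 13 for Twitter. The short snippet below adds that column to the existing table.

# Average words per line for each source
summary_table$Avg_Words <- round(summary_table$Word_Count / summary_table$Line_Count, 1)
summary_table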
# Fix the RNG seed and draw a random sample of 1,000 lines from each source
set.seed(123)
sample_blogs   <- sample(blogs, 1000)
sample_news    <- sample(news, 1000)
sample_twitter <- sample(twitter, 1000)
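Before plotting, a numeric summary of the character counts gives a compact view of the line-length distributions; the sketch below applies base R's summary() to each sample.

# Five-number summary (plus mean) of line lengths per sampled source
sapply(list(Blogs   = sample_blogs,
            News    = sample_news,
            Twitter = sample_twitter),
       function(x) summary(nchar(x)))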
# Histograms of line lengths (in characters) for each sampled source
hist(nchar(sample_blogs),
     col  = "lightblue",
     main = "Blog Line Lengths",
     xlab = "Characters")

hist(nchar(sample_news),
     col  = "lightgreen",
     main = "News Line Lengths",
     xlab = "Characters")

hist(nchar(sample_twitter),
     col  = "lightpink",
     main = "Twitter Line Lengths",
     xlab = "Characters")