# Read the three en_US corpora into character vectors, one element per line.
blogs <- readLines(
  con = "final/en_US/en_US.blogs.txt",
  encoding = "UTF-8",
  warn = FALSE
)
news <- readLines(
  con = "final/en_US/en_US.news.txt",
  encoding = "UTF-8",
  warn = FALSE
)
twitter <- readLines(
  con = "final/en_US/en_US.twitter.txt",
  encoding = "UTF-8",
  warn = FALSE
)
## Data Preprocessing
# Normalize raw text before tokenization.
#
# Steps, applied to every element of `text`:
#   1. transliterate Latin characters to their ASCII equivalents,
#   2. lowercase,
#   3. drop every character that is not a-z, 0-9, punctuation or whitespace,
#   4. collapse each run of whitespace to a single space,
#   5. trim leading and trailing whitespace.
#
# Args:
#   text: character vector to clean.
#
# Returns:
#   The cleaned text, with one result per input element.
clean_text <- function(text) {
  ascii <- stringi::stri_trans_general(text, "Latin-ASCII")
  lowered <- tolower(ascii)
  allowed_only <- gsub("[^a-z0-9[:punct:][:space:]]", "", lowered)
  single_spaced <- gsub("\\s+", " ", allowed_only)
  trimws(single_spaced)
}
# Clean each corpus with one vectorized call per source.
# clean_text() is built only from vectorized functions, so it can take the
# whole character vector at once. Wrapping it in sapply() would call it once
# per line and attach every raw line as an element name, which is far slower
# and keeps a second copy of the raw text in memory.
blogs_clean <- clean_text(blogs)
news_clean <- clean_text(news)
twitter_clean <- clean_text(twitter)
# Draw a reproducible 2% random sample of the cleaned lines from each source.
# The seed is set once, so the three draws must stay in this order.
set.seed(123)
blogs_sample <- sample(blogs_clean, size = length(blogs_clean) * 0.02)
news_sample <- sample(news_clean, size = length(news_clean) * 0.02)
twitter_sample <- sample(twitter_clean, size = length(twitter_clean) * 0.02)
# Stack the samples into one data frame and label every row with its source.
sample_data <- data.frame(
  Text = c(blogs_sample, news_sample, twitter_sample),
  Source = rep(
    c("Blog", "News", "Twitter"),
    times = c(
      length(blogs_sample),
      length(news_sample),
      length(twitter_sample)
    )
  )
)
# Build an n-gram vocabulary and document-term matrix (DTM) for a corpus.
#
# Args:
#   text: character vector, one document per element.
#   n:    number of words per n-gram (e.g. 2 for bigrams); a single number >= 1.
#
# Returns:
#   A list with two elements:
#     dtm   - document-term matrix with one row per document and one column
#             per distinct n-gram,
#     vocab - the text2vec vocabulary of n-grams with their counts.
generate_ngrams <- function(text, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1)
  # Split every document into its n-grams (exactly n words each).
  tokens <- tokenizers::tokenize_ngrams(text, n = n, n_min = n)
  # Drop any NA tokens (e.g. from documents that yield no complete n-gram) so
  # they cannot end up as a vocabulary term.
  tokens <- lapply(tokens, function(doc) doc[!is.na(doc)])
  # Hand the token list to itoken() as-is. Flattening it to a character vector
  # would make itoken() re-tokenize each n-gram on spaces, turning the n-grams
  # back into single words and making every n-gram its own "document".
  it <- text2vec::itoken(tokens)
  # Vocabulary of n-grams, and a vectorizer that maps them to DTM columns.
  vocab <- text2vec::create_vocabulary(it)
  vectorizer <- text2vec::vocab_vectorizer(vocab)
  dtm <- text2vec::create_dtm(it, vectorizer)
  list(dtm = dtm, vocab = vocab)
}
# Run the n-gram builder with n = 2 (bigrams) on the sampled text, then
# unpack the two parts of its result.
bigrams_result <- generate_ngrams(text = sample_data[["Text"]], n = 2)
bigrams_dtm <- bigrams_result[["dtm"]]
bigrams_vocab <- bigrams_result[["vocab"]]
# Per-source summary of the raw (uncleaned) corpora: number of lines and
# mean number of characters per line.
stats <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Total_Entries = lengths(list(blogs, news, twitter)),
  Average_Length = vapply(
    list(blogs, news, twitter),
    function(lines) mean(nchar(lines)),
    numeric(1)
  )
)
# Render the summary as a captioned table.
knitr::kable(
  stats,
  caption = "Basic Summary Statistics for Each Dataset"
)
| Source | Total_Entries | Average_Length |
|---|---|---|
| Blogs | 899288 | 229.98695 |
| News | 1010242 | 201.16285 |
| Twitter | 2360148 | 68.68045 |
# Bar chart: number of entries (lines) in each source.
ggplot(data = stats, mapping = aes(Source, Total_Entries, fill = Source)) +
  geom_bar(stat = "identity", colour = "black") +
  labs(
    title = "Total Entries by Source",
    x = "Source",
    y = "Total Entries"
  ) +
  scale_fill_brewer(palette = "Pastel1") +
  theme_minimal()
# Bar chart: mean entry length (in characters) for each source.
ggplot(data = stats, mapping = aes(Source, Average_Length, fill = Source)) +
  geom_bar(stat = "identity", colour = "black") +
  labs(
    title = "Average Length of Entries by Source",
    x = "Source",
    y = "Average Length of Entries"
  ) +
  scale_fill_brewer(palette = "Pastel2") +
  theme_minimal()
3. Interesting findings so far. Tweets tend to be much shorter than blog
entries and news articles: about 69 characters on average, versus about 230
for blogs and 201 for news. As a next step, we can highlight the most
frequent words or phrases that might be of interest or concern.
## Future Plans for Prediction Algorithm and Shiny App
We plan to use logistic regression or other machine learning techniques to predict whether a text is a blog entry, a news article, or a tweet, based on its content and structure. The Shiny app will let users enter text and receive the predicted category. It will also visualize how the input's text statistics compare with those of the trained model.