1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
# Load data ----
# Read one corpus file into a character vector, one element per line.
#   path: location of the text file.
# The connection is opened in binary mode ("rb") so that stray control bytes
# in the raw text (e.g. Ctrl-Z, which a text-mode read on Windows can treat
# as end-of-file) cannot cut the read short. skipNul = TRUE drops embedded
# nul bytes; without it a nul ends the current line early, and warn = FALSE
# would hide the warning R gives about that.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("final/en_US/en_US.blogs.txt")
news <- read_corpus("final/en_US/en_US.news.txt")
twitter <- read_corpus("final/en_US/en_US.twitter.txt")

Data Preprocessing

# Text cleaning ----
# Normalise raw text.
#   text: character vector; every step below is vectorised, so a whole
#         vector can be processed in a single call.
# Returns the cleaned character vector.
clean_text <- function(text) {
  # Transliterate Latin-script characters to their closest ASCII form
  ascii <- stringi::stri_trans_general(text, "Latin-ASCII")
  lowered <- tolower(ascii)
  # Drop every character that is not a lowercase letter, a digit,
  # punctuation or whitespace
  kept <- gsub("[^a-z0-9[:punct:][:space:]]", "", lowered)
  # Squeeze each run of whitespace to one space, then strip both ends
  trimws(gsub("\\s+", " ", kept))
}
 
# clean_text() is built entirely from vectorised functions, so each corpus is
# cleaned in a single call. Wrapping it in sapply() invoked it once per line
# and, because sapply() names its result after character input, also attached
# a copy of every raw line as the name of its cleaned version.
blogs_clean <- clean_text(blogs)
news_clean <- clean_text(news)
twitter_clean <- clean_text(twitter)

# Draw a reproducible 2% random sample of the cleaned lines from each source
# (the seed is fixed once, so the three draws happen in this order).
set.seed(123)
blogs_sample <- sample(x = blogs_clean, size = 0.02 * length(blogs_clean))
news_sample <- sample(x = news_clean, size = 0.02 * length(news_clean))
twitter_sample <- sample(x = twitter_clean, size = 0.02 * length(twitter_clean))

# Stack the three samples into one data frame and label every row with the
# source it was drawn from.
sample_data <- data.frame(
  Text = c(blogs_sample, news_sample, twitter_sample),
  Source = rep(
    c("Blog", "News", "Twitter"),
    times = c(length(blogs_sample), length(news_sample), length(twitter_sample))
  )
)
# N-gram helper ----
# Build an n-gram vocabulary and document-term matrix for a set of texts.
#   text: character vector of documents.
#   n:    n-gram size (2 = bigrams, 3 = trigrams, ...).
# Returns a list with two elements:
#   dtm   - document-term matrix from create_dtm(), one row per element of
#           `text` and one column per distinct n-gram;
#   vocab - vocabulary from create_vocabulary() over the same n-grams.
generate_ngrams <- function(text, n) {
  # One character vector of n-grams per input document
  tokens <- tokenize_ngrams(text, n = n, n_min = n)

  # Discard missing n-grams (presumably produced for documents with fewer
  # than n words - TODO confirm) so they are never counted as a term
  tokens <- lapply(tokens, function(grams) grams[!is.na(grams)])

  # Pass itoken() the token *list*, not a flattened character vector: for
  # character input itoken() applies its default whitespace tokenizer, which
  # would split every "w1 w2" n-gram back into single words. List input is
  # taken as already tokenized. Names are dropped so that they are not picked
  # up as document ids.
  it <- itoken(unname(tokens))

  # Create vocabulary and vectorizer
  vocab <- create_vocabulary(it)
  vectorizer <- vocab_vectorizer(vocab)

  # Create the document-term matrix (DTM)
  dtm <- create_dtm(it, vectorizer)

  list(dtm = dtm, vocab = vocab)
}

# Build the bigram (n = 2) model from the sampled text, then unpack the
# document-term matrix and the vocabulary from the returned list.
bigrams_result <- generate_ngrams(text = sample_data[["Text"]], n = 2)
bigrams_dtm <- bigrams_result[["dtm"]]
bigrams_vocab <- bigrams_result[["vocab"]]
  2. Create a basic report of summary statistics about the data sets.
# Per-source summary of the raw data: number of lines read and mean number of
# characters per line.
stats <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Total_Entries = lengths(list(blogs, news, twitter)),
  Average_Length = vapply(
    list(blogs, news, twitter),
    function(lines) mean(nchar(lines)),
    numeric(1)
  )
)

# Render the summary as a captioned table
knitr::kable(x = stats, caption = "Basic Summary Statistics for Each Dataset")
Basic Summary Statistics for Each Dataset
Source Total_Entries Average_Length
Blogs 899288 229.98695
News 1010242 201.16285
Twitter 2360148 68.68045
# Bar chart: number of entries per source
ggplot(data = stats, mapping = aes(Source, Total_Entries, fill = Source)) +
  geom_bar(stat = "identity", colour = "black") +
  scale_fill_brewer(palette = "Pastel1") +
  ggtitle("Total Entries by Source") +
  xlab("Source") +
  ylab("Total Entries") +
  theme_minimal()

# Bar chart: mean entry length (in characters) per source
ggplot(data = stats, mapping = aes(Source, Average_Length, fill = Source)) +
  geom_bar(stat = "identity", colour = "black") +
  scale_fill_brewer(palette = "Pastel2") +
  ggtitle("Average Length of Entries by Source") +
  xlab("Source") +
  ylab("Average Length of Entries") +
  theme_minimal()

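3. Report any interesting findings that you amassed so far. Tweets tend to be much shorter than blog entries and news articles: on average about 69 characters per tweet, versus roughly 230 for blog entries and 201 for news articles. A natural next step is to highlight the most frequent words and phrases that might be of interest or concern.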

Future Plans for Prediction Algorithm and Shiny App

We plan to use logistic regression or other machine learning techniques to predict the likelihood that a text is a blog entry, a news article, or a tweet, based on its content and structure. The app will allow users to input text and receive a prediction of the category it fits into. It will also provide some visualization of the text's statistics compared with the trained model.