---
title: "milestone"
author: "Taiki Matsugi"
date: "2025-04-02"
output: 
  html_document:
    toc: true
    toc_float: true
    theme: united
---

## Introduction

This report presents an exploratory analysis of English text data collected from three different sources (blogs, news, and Twitter). This analysis serves as a foundation for developing a text prediction algorithm.

## Data Loading and Basic Statistics


``` r
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

# Set data file paths
blogs_file <- "./data/en_US/en_US.blogs.txt"
news_file <- "./data/en_US/en_US.news.txt"
twitter_file <- "./data/en_US/en_US.twitter.txt"

# Define a safe_readLines function to handle potential errors
safe_readLines <- function(file_path) {
  tryCatch({
    readLines(file_path, encoding = "UTF-8", skipNul = TRUE)
  }, error = function(e) {
    warning(paste("Error reading file:", file_path, "\n", e$message))
    NULL  # Return NULL if there's an error
  })
}


# Load data (with sampling)
set.seed(123) # For reproducibility
sample_size <- 10000 # Sampling to manage memory usage

blogs_text <- safe_readLines(blogs_file)
news_text <- safe_readLines(news_file)
twitter_text <- safe_readLines(twitter_file)

# Sampling
if (!is.null(blogs_text)) blogs_text <- sample(blogs_text, min(length(blogs_text), sample_size))
if (!is.null(news_text)) news_text <- sample(news_text, min(length(news_text), sample_size))
if (!is.null(twitter_text)) twitter_text <- sample(twitter_text, min(length(twitter_text), sample_size))

```

### Basic Summary Statistics

``` r

# Function to calculate basic statistics for each source
calculate_stats <- function(text, source_name) {
  if (is.null(text)) {
    return(data.frame(
      Source = source_name,
      Lines = 0,
      TotalWords = 0,
      UniqueWords = 0,
      AvgWordsPerLine = 0,
      MaxWordsPerLine = 0
    ))
  }
  
  words <- unlist(strsplit(tolower(paste(text, collapse = " ")), "\\s+"))
  data.frame(
    Source = source_name,
    Lines = length(text),
    TotalWords = length(words),
    UniqueWords = length(unique(words)),
    AvgWordsPerLine = mean(str_count(text, "\\S+")),
    MaxWordsPerLine = max(str_count(text, "\\S+"))
  )
}

# Calculate statistics
stats_df <- rbind(
  calculate_stats(blogs_text, "Blogs"),
  calculate_stats(news_text, "News"),
  calculate_stats(twitter_text, "Twitter")
)

# Display results
knitr::kable(stats_df, 
             caption = "Basic Statistics by Data Source",
             digits = 2,
             format.args = list(big.mark = ","))
```

Table: Basic Statistics by Data Source

| Source  |  Lines | TotalWords | UniqueWords | AvgWordsPerLine | MaxWordsPerLine |
|:--------|-------:|-----------:|------------:|----------------:|----------------:|
| Blogs   | 10,000 |    414,435 |      52,851 |           41.44 |             808 |
| News    | 10,000 |    341,001 |      50,539 |           34.10 |             265 |
| Twitter | 10,000 |    129,029 |      24,943 |           12.90 |              35 |

## Data Visualization

### Sentence Length Distribution

``` r

# Create dataframe of sentence lengths
get_sentence_lengths <- function(text, source) {
  if (is.null(text)) return(data.frame(length = numeric(0), source = character(0)))
  data.frame(
    length = str_count(text, "\\S+"),
    source = source
  )
}

# Combine sentence lengths from all sources
sentence_lengths <- rbind(
  get_sentence_lengths(blogs_text, "Blogs"),
  get_sentence_lengths(news_text, "News"),
  get_sentence_lengths(twitter_text, "Twitter")
)

# Create histogram
if (nrow(sentence_lengths) > 0) {
  print(ggplot(sentence_lengths, aes(x = length, fill = source)) +
    geom_histogram(binwidth = 5, position = "dodge", alpha = 0.7) +
    facet_wrap(~source, scales = "free_y") +
    labs(title = "Distribution of Sentence Lengths by Source",
         x = "Number of Words",
         y = "Frequency") +
    theme_minimal() +
    scale_fill_brewer(palette = "Set2") +
    theme(
      axis.text = element_text(size = 10),
      axis.title = element_text(size = 12),
      plot.title = element_text(size = 14, hjust = 0.5)
    ))
}

```

### Frequent Word Analysis

``` r

# Function to create word clouds
create_wordcloud <- function(text, title) {
  if (is.null(text) || length(text) == 0) {
    plot.new()
    title(main = paste(title, "(No Data)"))
    return()
  }
  
  # Create and preprocess corpus
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace) # Remove extra whitespace

  # Calculate word frequencies
  tdm <- TermDocumentMatrix(corpus)
  word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  
  # Create word cloud; wordcloud() has no `main` argument, so the
  # title is drawn separately with title() below
  wordcloud(words = names(word_freq), 
            freq = word_freq,
            max.words = 100,
            random.order = FALSE,
            colors = brewer.pal(8, "Dark2"),
            scale = c(3, 0.5)) # Adjust scale for better readability
  title(main = title)
}

# Display three word clouds in one figure
par(mfrow = c(1, 3), mar = c(2, 2, 2, 2))
create_wordcloud(blogs_text, "Blogs")
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## (tm emits this "transformation drops documents" warning for every
## tm_map() call on a SimpleCorpus; it is expected and harmless here.
## The identical warnings for the remaining calls are omitted below.)
create_wordcloud(news_text, "News")
create_wordcloud(twitter_text, "Twitter")

```

## Prediction Algorithm and Shiny App Plan

### Prediction Algorithm Design

1. **N-gram Model Construction**
   - Create unigram, bigram, and trigram models
   - Calculate frequency-based probabilities
   - Implement a backoff model (a minimal sketch follows this list)
2. **Model Optimization**
   - Implement Katz’s backoff algorithm
   - Apply Good-Turing smoothing
   - Evaluate and tune model performance
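
To fix the shape of the approach before the full implementation, the sketch below builds frequency tables of bigrams and trigrams from a small sample and predicts the next word by trigram lookup, backing off to bigrams when no trigram matches. `tokenize()`, `build_ngrams()`, and `predict_next()` are illustrative names only, and the plain frequency ranking is a simplified stand-in for the Katz backoff and Good-Turing smoothing planned above (Good-Turing replaces a raw count $c$ with $c^* = (c+1)\,N_{c+1}/N_c$, where $N_c$ is the number of n-grams observed exactly $c$ times).

``` r
# Illustrative sketch only, not the final model: trigram lookup with a
# plain backoff to bigrams. All function names here are placeholders.
tokenize <- function(text) {
  tokens <- unlist(str_split(str_replace_all(tolower(text), "[^a-z' ]", " "), "\\s+"))
  tokens[tokens != ""]
}

build_ngrams <- function(tokens, n) {
  starts <- seq_len(max(length(tokens) - n + 1, 0))
  grams <- vapply(starts,
                  function(i) paste(tokens[i:(i + n - 1)], collapse = " "),
                  character(1))
  sort(table(grams), decreasing = TRUE)  # named counts, most frequent first
}

# Small demo corpus to keep the sketch fast
tokens   <- tokenize(paste(head(blogs_text, 1000), collapse = " "))
bigrams  <- build_ngrams(tokens, 2)
trigrams <- build_ngrams(tokens, 3)

predict_next <- function(input, k = 3) {
  words <- tokenize(input)
  # Try trigrams first: condition on the last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigrams[startsWith(names(trigrams), paste0(prefix, " "))]
    if (length(hits) > 0) return(word(names(head(hits, k)), 3))
  }
  # Back off to bigrams: condition on the last word only
  hits <- bigrams[startsWith(names(bigrams), paste0(tail(words, 1), " "))]
  word(names(head(hits, k)), 2)
}

predict_next("thanks for the")
```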

### Shiny App Features

1. **User Interface** (a skeleton sketch follows this list)
   - Simple text input field
   - Real-time word prediction display
   - Alternative predictions display (top 3-5 candidates)
2. **Visualization Features**
   - Prediction probability bar charts
   - N-gram analysis of input text
   - Prediction confidence scores
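
As a starting point for the UI, a bare-bones Shiny skeleton could look like the sketch below. The layout, input ids, and the `predict_next()` helper it calls are placeholders carried over from the earlier sketch, not the final design.

``` r
# Minimal Shiny skeleton (a sketch, not the final app). Assumes the
# illustrative predict_next() helper defined in the earlier sketch.
library(shiny)

ui <- fluidPage(
  titlePanel("Next-Word Prediction"),
  textInput("user_text", "Type a phrase:", placeholder = "e.g. thanks for the"),
  tableOutput("predictions")
)

server <- function(input, output) {
  output$predictions <- renderTable({
    req(nzchar(input$user_text))  # wait for non-empty input
    candidates <- predict_next(input$user_text, k = 5)
    data.frame(Rank = seq_along(candidates), Prediction = candidates)
  })
}

shinyApp(ui, server)
```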

## Next Steps

1. Data Cleaning and Tokenization
2. N-gram Model Implementation and Evaluation (a perplexity sketch follows this list)
3. Shiny App Prototype Development
4. User Feedback-based Improvements
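
For the evaluation in step 2, perplexity on held-out text is the standard metric: $PP = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_i\right)$, where $p_i$ is the probability the model assigns to the $i$-th word given its context, and lower is better. A minimal sketch, with invented probabilities standing in for a real model:

``` r
# Perplexity of a model on a test sequence; lower is better
perplexity <- function(log_probs) {
  exp(-mean(log_probs))
}

# Illustration only: made-up per-word probabilities for a 4-word sentence
perplexity(log(c(0.10, 0.05, 0.20, 0.01)))

# Sanity check: a uniform model over a 50-word vocabulary scores exactly 50
perplexity(log(rep(1 / 50, 4)))
```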