---
title: "milestone"
author: "Taiki Matsugi"
date: "2025-04-02"
output: 
  html_document:
    toc: true
    toc_float: true
    theme: united
---

## Introduction

This report presents an exploratory analysis of English text data collected from three different sources (blogs, news, and Twitter). This analysis serves as a foundation for developing a text prediction algorithm.

## Data Loading and Basic Statistics


``` r
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

# Set data file paths
blogs_file <- "./data/en_US/en_US.blogs.txt"
news_file <- "./data/en_US/en_US.news.txt"
twitter_file <- "./data/en_US/en_US.twitter.txt"

# Define a safe_readLines function to handle potential errors
safe_readLines <- function(file_path) {
  tryCatch({
    readLines(file_path, encoding = "UTF-8", skipNul = TRUE)
  }, error = function(e) {
    warning(paste("Error reading file:", file_path, "\n", e$message))
    NULL  # Return NULL if there's an error
  })
}


# Load data (with sampling)
set.seed(123) # For reproducibility
sample_size <- 10000 # Sampling to manage memory usage

blogs_text <- safe_readLines(blogs_file)
news_text <- safe_readLines(news_file)
twitter_text <- safe_readLines(twitter_file)

# Sampling
if (!is.null(blogs_text)) blogs_text <- sample(blogs_text, min(length(blogs_text), sample_size))
if (!is.null(news_text)) news_text <- sample(news_text, min(length(news_text), sample_size))
if (!is.null(twitter_text)) twitter_text <- sample(twitter_text, min(length(twitter_text), sample_size))

```

### Basic Summary Statistics

``` r

# Function to calculate basic statistics for each source
calculate_stats <- function(text, source_name) {
  if (is.null(text)) {
    return(data.frame(
      Source = source_name,
      Lines = 0,
      TotalWords = 0,
      UniqueWords = 0,
      AvgWordsPerLine = 0,
      MaxWordsPerLine = 0
    ))
  }
  
  words <- unlist(strsplit(tolower(paste(text, collapse = " ")), "\\s+"))
  data.frame(
    Source = source_name,
    Lines = length(text),
    TotalWords = length(words),
    UniqueWords = length(unique(words)),
    AvgWordsPerLine = mean(str_count(text, "\\S+")),
    MaxWordsPerLine = max(str_count(text, "\\S+"))
  )
}

# Calculate statistics
stats_df <- rbind(
  calculate_stats(blogs_text, "Blogs"),
  calculate_stats(news_text, "News"),
  calculate_stats(twitter_text, "Twitter")
)

# Display results
knitr::kable(stats_df, 
             caption = "Basic Statistics by Data Source",
             digits = 2,
             format.args = list(big.mark = ","))
```

Table: Basic Statistics by Data Source

| Source  |  Lines | TotalWords | UniqueWords | AvgWordsPerLine | MaxWordsPerLine |
|:--------|-------:|-----------:|------------:|----------------:|----------------:|
| Blogs   | 10,000 |    414,435 |      52,851 |           41.44 |             808 |
| News    | 10,000 |    341,001 |      50,539 |           34.10 |             265 |
| Twitter | 10,000 |    129,029 |      24,943 |           12.90 |              35 |

## Data Visualization

### Sentence Length Distribution

``` r

# Create dataframe of sentence lengths
get_sentence_lengths <- function(text, source) {
  if (is.null(text)) return(data.frame(length = numeric(0), source = character(0)))
  data.frame(
    length = str_count(text, "\\S+"),
    source = source
  )
}

# Combine sentence lengths from all sources
sentence_lengths <- rbind(
  get_sentence_lengths(blogs_text, "Blogs"),
  get_sentence_lengths(news_text, "News"),
  get_sentence_lengths(twitter_text, "Twitter")
)

# Create histogram
if (nrow(sentence_lengths) > 0) {
  print(ggplot(sentence_lengths, aes(x = length, fill = source)) +
    geom_histogram(binwidth = 5, position = "dodge", alpha = 0.7) +
    facet_wrap(~source, scales = "free_y") +
    labs(title = "Distribution of Sentence Lengths by Source",
         x = "Number of Words",
         y = "Frequency") +
    theme_minimal() +
    scale_fill_brewer(palette = "Set2") +
    theme(
      axis.text = element_text(size = 10),
      axis.title = element_text(size = 12),
      plot.title = element_text(size = 14, hjust = 0.5)
    ))
}

```

### Frequent Word Analysis

``` r

# Function to create word clouds
create_wordcloud <- function(text, title) {
  if (is.null(text) || length(text) == 0) {
    plot.new()
    title(main = paste(title, "(No Data)"))
    return()
  }
  
  # Create and preprocess corpus
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace) # Remove extra whitespace

  # Calculate word frequencies
  tdm <- TermDocumentMatrix(corpus)
  word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  
  # Create word cloud; wordcloud() has no `main` argument, so the
  # title is drawn separately with title() below
  wordcloud(words = names(word_freq), 
            freq = word_freq,
            max.words = 100,
            random.order = FALSE,
            colors = brewer.pal(8, "Dark2"),
            scale = c(3, 0.5)) # Adjust scale for better readability
  title(main = title)
}

# Display three word clouds in one figure
par(mfrow = c(1, 3), mar = c(2, 2, 2, 2))
create_wordcloud(blogs_text, "Blogs")
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## (tm emits this "transformation drops documents" warning for every
## tm_map() call on a SimpleCorpus; it is expected and harmless here.
## The identical warnings for the remaining calls are omitted below.)
create_wordcloud(news_text, "News")
create_wordcloud(twitter_text, "Twitter")

```

## Prediction Algorithm and Shiny App Plan

### Prediction Algorithm Design

1. **N-gram Model Construction**
   - Create unigram, bigram, and trigram models
   - Calculate frequency-based probabilities
   - Implement a backoff model (a minimal sketch follows this list)
2. **Model Optimization**
   - Implement Katz’s backoff algorithm
   - Apply Good-Turing smoothing
   - Evaluate and tune model performance
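
To fix the shape of the approach before the full implementation, the sketch below builds frequency tables of bigrams and trigrams from a small sample and predicts the next word by trigram lookup, backing off to bigrams when no trigram matches. `tokenize()`, `build_ngrams()`, and `predict_next()` are illustrative names only, and the plain frequency ranking is a simplified stand-in for the Katz backoff and Good-Turing smoothing planned above (Good-Turing replaces a raw count $c$ with $c^* = (c+1)\,N_{c+1}/N_c$, where $N_c$ is the number of n-grams observed exactly $c$ times).

``` r
# Illustrative sketch only, not the final model: trigram lookup with a
# plain backoff to bigrams. All function names here are placeholders.
tokenize <- function(text) {
  tokens <- unlist(str_split(str_replace_all(tolower(text), "[^a-z' ]", " "), "\\s+"))
  tokens[tokens != ""]
}

build_ngrams <- function(tokens, n) {
  starts <- seq_len(max(length(tokens) - n + 1, 0))
  grams <- vapply(starts,
                  function(i) paste(tokens[i:(i + n - 1)], collapse = " "),
                  character(1))
  sort(table(grams), decreasing = TRUE)  # named counts, most frequent first
}

# Small demo corpus to keep the sketch fast
tokens   <- tokenize(paste(head(blogs_text, 1000), collapse = " "))
bigrams  <- build_ngrams(tokens, 2)
trigrams <- build_ngrams(tokens, 3)

predict_next <- function(input, k = 3) {
  words <- tokenize(input)
  # Try trigrams first: condition on the last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigrams[startsWith(names(trigrams), paste0(prefix, " "))]
    if (length(hits) > 0) return(word(names(head(hits, k)), 3))
  }
  # Back off to bigrams: condition on the last word only
  hits <- bigrams[startsWith(names(bigrams), paste0(tail(words, 1), " "))]
  word(names(head(hits, k)), 2)
}

predict_next("thanks for the")
```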

### Shiny App Features

1. **User Interface** (a skeleton sketch follows this list)
   - Simple text input field
   - Real-time word prediction display
   - Alternative predictions display (top 3-5 candidates)
2. **Visualization Features**
   - Prediction probability bar charts
   - N-gram analysis of input text
   - Prediction confidence scores
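
As a starting point for the UI, a bare-bones Shiny skeleton could look like the sketch below. The layout, input ids, and the `predict_next()` helper it calls are placeholders carried over from the earlier sketch, not the final design.

``` r
# Minimal Shiny skeleton (a sketch, not the final app). Assumes the
# illustrative predict_next() helper defined in the earlier sketch.
library(shiny)

ui <- fluidPage(
  titlePanel("Next-Word Prediction"),
  textInput("user_text", "Type a phrase:", placeholder = "e.g. thanks for the"),
  tableOutput("predictions")
)

server <- function(input, output) {
  output$predictions <- renderTable({
    req(nzchar(input$user_text))  # wait for non-empty input
    candidates <- predict_next(input$user_text, k = 5)
    data.frame(Rank = seq_along(candidates), Prediction = candidates)
  })
}

shinyApp(ui, server)
```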

## Next Steps

1. Data Cleaning and Tokenization
2. N-gram Model Implementation and Evaluation (a perplexity sketch follows this list)
3. Shiny App Prototype Development
4. User Feedback-based Improvements
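
For the evaluation in step 2, perplexity on held-out text is the standard metric: $PP = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_i\right)$, where $p_i$ is the probability the model assigns to the $i$-th word given its context, and lower is better. A minimal sketch, with invented probabilities standing in for a real model:

``` r
# Perplexity of a model on a test sequence; lower is better
perplexity <- function(log_probs) {
  exp(-mean(log_probs))
}

# Illustration only: made-up per-word probabilities for a 4-word sentence
perplexity(log(c(0.10, 0.05, 0.20, 0.01)))

# Sanity check: a uniform model over a 50-word vocabulary scores exactly 50
perplexity(log(rep(1 / 50, 4)))
```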