This report demonstrates that the SwiftKey English text data has been downloaded, loaded, summarized, and visualized, and outlines a concise plan for a next-word prediction algorithm and Shiny app.
## Download & List Files
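The chunks below rely on tidyverse-style packages. The report's setup chunk is not shown, so the following package list is inferred from the functions used later; it is an assumption, not part of the original output.
# Packages assumed to be installed and loaded (inferred from the calls below)
library(dplyr)      # mutate(), rowwise(), count(), group_by(), slice_max(), %>%
library(tibble)     # tibble()
library(tidyr)      # unnest()
library(stringr)    # str_count(), str_replace_all(), str_squish()
library(tidytext)   # unnest_tokens(), stop_words
library(ggplot2)    # histograms and bar charts
# R.utils is only called via R.utils::countLines(), so it just needs to be installed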
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
unzip("Coursera-SwiftKey.zip")
}
data_dir <- "final/en_US"
files <- tibble(
  Dataset = c("Twitter", "Blogs", "News"),
  Path = file.path(data_dir, c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt"))
)
files
## # A tibble: 3 × 2
## Dataset Path
## <chr> <chr>
## 1 Twitter final/en_US/en_US.twitter.txt
## 2 Blogs final/en_US/en_US.blogs.txt
## 3 News final/en_US/en_US.news.txt
## Summary Statistics
# Count the words in a file by reading it in fixed-size chunks, so even the
# largest file never has to sit in memory all at once
word_count_file <- function(fp, chunk = 100000){
  con <- file(fp, open = "r"); on.exit(close(con))
  total <- 0L
  repeat{
    x <- readLines(con, n = chunk, warn = FALSE, skipNul = TRUE)
    if (length(x) == 0) break
    total <- total + sum(stringr::str_count(x, "\\S+"))
  }
  total
}
# Generate summary statistics
summaries <- files %>%
  rowwise() %>%
  mutate(
    Size_MB = round(file.info(Path)$size / (1024^2), 2),
    Lines = R.utils::countLines(Path),
    Words = word_count_file(Path),
    Avg_Words_Per_Line = round(Words / Lines, 2)
  ) %>%
  ungroup()
summaries
## # A tibble: 3 × 6
## Dataset Path Size_MB Lines Words Avg_Words_Per_Line
## <chr> <chr> <dbl> <int> <int> <dbl>
## 1 Twitter final/en_US/en_US.twitter.txt 159. 2.36e6 3.04e7 12.9
## 2 Blogs final/en_US/en_US.blogs.txt 200. 8.99e5 3.73e7 41.5
## 3 News final/en_US/en_US.news.txt 196. 1.01e6 3.44e7 34.0
## Sample & Clean
# Take the first max_lines lines of each file as a working sample
read_sample <- function(fp, max_lines = 50000){
  con <- file(fp, "r"); on.exit(close(con))
  tibble(text = readLines(con, n = max_lines, warn = FALSE, skipNul = TRUE))
}
# Apply sampling
sampled <- files %>% mutate(Data = lapply(Path, read_sample))
sampled
## # A tibble: 3 × 3
## Dataset Path Data
## <chr> <chr> <list>
## 1 Twitter final/en_US/en_US.twitter.txt <tibble [50,000 × 1]>
## 2 Blogs final/en_US/en_US.blogs.txt <tibble [50,000 × 1]>
## 3 News final/en_US/en_US.news.txt <tibble [50,000 × 1]>
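Taking the first 50,000 lines is fast but not random, so material near the top of each file is over-represented. If a representative sample is preferred, each line can instead be kept with a fixed probability, as in the sketch below. This is an alternative approach, not the one used for the figures in this report, and the 10% keep rate is purely illustrative.
# Alternative sampler: keep each line with probability `prob` instead of taking the first N lines
read_random_sample <- function(fp, prob = 0.1, seed = 123){
  set.seed(seed)
  lines <- readLines(fp, warn = FALSE, skipNul = TRUE)  # reads the whole file into memory
  tibble(text = lines[rbinom(length(lines), size = 1, prob = prob) == 1])
}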
## Cleaning & Tokenization Helpers
data("stop_words")
clean_text <- function(text){
text %>%
str_to_lower() %>%
str_replace_all("https?://\\S+|www\\.\\S+", " ") %>%
str_replace_all("@\\w+|[^a-z'\\s]", " ") %>%
str_squish()
}
tokenize_words <- function(df){
df %>%
mutate(text = clean_text(text)) %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word)
}
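A quick check on an invented sentence (made up here purely for illustration) shows what the helpers do: the URL, the @mention, the digits and the punctuation are stripped, and tokenization then drops common stop words.
clean_text("Check out https://example.com with @friend at 10pm!!")
tibble(text = "Check out https://example.com with @friend at 10pm!!") %>% tokenize_words()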
## Word Distribution Plot
line_wc <- sampled %>%
  unnest(Data) %>%
  mutate(Words_in_Line = str_count(text, "\\S+"))
ggplot(line_wc, aes(Words_in_Line)) +
  geom_histogram(binwidth = 2, fill = "steelblue") +
  facet_wrap(~ Dataset, scales = "free_y") +
  labs(title = "Word Distribution per Line by Dataset",
       x = "Words per Line", y = "Frequency")
## Most Common Words (Stopwords Removed)
top_words <- sampled %>%
  unnest(Data) %>%
  tokenize_words() %>%
  count(Dataset, word, sort = TRUE) %>%
  group_by(Dataset) %>%
  slice_max(n, n = 15, with_ties = FALSE) %>%  # exactly 15 words per dataset, even with tied counts
  ungroup()
ggplot(top_words, aes(n, reorder(word, n))) +
  geom_col(fill = "orange") +
  facet_wrap(~ Dataset, scales = "free") +
  labs(title = "Top 15 Common Words (Excluding Stopwords)",
       x = "Frequency", y = "")
## Conclusion
The three datasets were loaded and profiled without issue. Twitter lines are short and informal (about 13 words per line on average), blog lines are the longest and most descriptive (about 42 words), and news lines are formal and mid-length (about 34 words). These findings support building an n-gram (unigram/bigram/trigram) language model with a simple backoff strategy for next-word prediction, to be deployed as a Shiny web application.
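As a rough illustration of the planned backoff strategy, the sketch below assumes the bigram and trigram tables from the previous sketch: it looks for the most frequent trigram starting with the last two words typed, backs off to bigrams keyed on the last word, and finally falls back to a default word. All names here are illustrative, not part of the analysis above.
# Illustrative backoff lookup over the trigram/bigram tables sketched earlier
predict_next <- function(last_two, trigrams, bigrams, fallback = "the"){
  # 1. Trigrams whose first two words match the input
  hit <- trigrams %>%
    filter(str_detect(ngram, paste0("^", last_two, " "))) %>%
    slice_max(n, n = 1, with_ties = FALSE)
  if (nrow(hit) == 1) return(word(hit$ngram, 3))
  # 2. Back off to bigrams keyed on the single last word
  last_one <- word(last_two, -1)
  hit <- bigrams %>%
    filter(str_detect(ngram, paste0("^", last_one, " "))) %>%
    slice_max(n, n = 1, with_ties = FALSE)
  if (nrow(hit) == 1) return(word(hit$ngram, 2))
  # 3. Fall back to a default word when nothing matches
  fallback
}
predict_next("one of", trigrams, bigrams)
The Shiny app would wrap a lookup of this kind: a text input for the user's sentence, a call to the prediction function on its last words, and the suggested word(s) rendered back to the page.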