1. Introduction

This milestone report shows that I have successfully loaded, explored, and summarized the SwiftKey training data (blogs, news, Twitter). Below are key statistics, plots, and initial findings. Feedback on my plan for the predictive model and Shiny app is welcome!

2. Data Loading and Summary

library(tidyverse)
library(tidytext)

# Corpus file locations, relative to the working directory (edit as needed).
blogs_file   <- "en_US.blogs.txt"
news_file    <- "en_US.news.txt"
twitter_file <- "en_US.twitter.txt"

# Load each corpus as a character vector holding one element per line.
twitter_lines <- read_lines(twitter_file)
news_lines    <- read_lines(news_file)
blogs_lines   <- read_lines(blogs_file)

# Per-source summary: number of lines and number of words, where a "word" is
# any maximal run of non-whitespace characters (regex \S+). The line vectors
# are held in a temporary list column so both counts are computed the same
# way for every source, then dropped by transmute().
stats <- tibble(
  source = c("blogs", "news", "twitter"),
  text   = list(blogs_lines, news_lines, twitter_lines)
) %>%
  transmute(
    source,
    lines = lengths(text),
    words = map_int(text, function(txt) sum(str_count(txt, "\\S+")))
  )

# Render the summary as a captioned table in the report.
knitr::kable(
  stats,
  caption = "Line and Word Counts per File"
)
Table: Line and Word Counts per File

| source  |   lines |    words |
|:--------|--------:|---------:|
| blogs   |  899288 | 37334131 |
| news    | 1010242 | 34372530 |
| twitter | 2360148 | 30373543 |
# Histogram of the number of characters per line in the blogs corpus.
# `breaks = 50` requests roughly 50 bins; base hist() treats a single number
# as a suggestion and may round to "pretty" break points.
hist(
  nchar(blogs_lines),
  breaks = 50,
  main   = "Blog Line Lengths",
  xlab   = "Characters"
)

# Twenty most frequent tokens in the Twitter corpus.
# Each tweet is split into one row per token (tidytext's default tokenizer is
# used, since no `token` argument is given), tokens are tallied with the most
# frequent first, and only the first 20 rows are kept. No stop-word filtering
# is applied here.
tw_words <- tibble(text = twitter_lines) %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20)

# Horizontal bar chart of the top Twitter words.
# Words are ordered by their count `n` so the bars are sorted; coord_flip()
# turns the vertical bars sideways so the word labels read horizontally.
ggplot(data = tw_words, mapping = aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(title = "Top 20 Twitter Words", x = "Word", y = "Count") +
  coord_flip()