2. Data Loading and Summary

library(tidyverse)
library(tidytext)

# Locations of the three en_US corpus files, given relative to the working
# directory; edit these if the data lives elsewhere on your system.
blogs_file <- "en_US.blogs.txt"
news_file <- "en_US.news.txt"
twitter_file <- "en_US.twitter.txt"

# Load each corpus as a character vector with one element per line.
blogs_lines <- readr::read_lines(file = blogs_file)
news_lines <- readr::read_lines(file = news_file)
twitter_lines <- readr::read_lines(file = twitter_file)

# Total number of whitespace-delimited tokens across all lines of a corpus.
count_tokens <- function(txt) {
  sum(str_count(txt, "\\S+"))
}

# One row per corpus: how many lines it has and how many tokens in total.
stats <- tibble(
  source = c("blogs", "news", "twitter"),
  lines = lengths(list(blogs_lines, news_lines, twitter_lines)),
  words = c(
    count_tokens(blogs_lines),
    count_tokens(news_lines),
    count_tokens(twitter_lines)
  )
)

# Render the summary as a captioned table in the report.
knitr::kable(stats, caption = "Line and Word Counts per File")

Line and Word Counts per File
source	lines	words
blogs	899288	37334131
news	1010242	34372530
twitter	2360148	30373543

# Histogram of blog line lengths, measured in characters per line.
hist(
  x = nchar(blogs_lines),
  breaks = 50,
  main = "Blog Line Lengths",
  xlab = "Characters"
)

# Split the tweets into word tokens, tally each distinct word, and keep the
# 20 most frequent (count(sort = TRUE) orders rows by descending frequency,
# so the first 20 rows are the top 20).
tw_words <- tibble(text = twitter_lines) %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20)

# Horizontal bar chart of the top words; reorder() sorts the bars by count.
ggplot(data = tw_words, mapping = aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(title = "Top 20 Twitter Words", x = "Word", y = "Count") +
  coord_flip()

SwiftKey Capstone Milestone Report

H. G. Abhishek

2025-07-17

1. Introduction

2. Data Loading and Summary