This milestone report shows that I have successfully loaded, explored, and summarized the SwiftKey training data (blogs, news, Twitter). Below are key statistics, plots, and initial findings. Feedback on my plan for the predictive model and Shiny app is welcome!
library(tidyverse)
library(tidytext)
# Load the three English corpora, one character string per line of the file.
# The paths are relative to the working directory; edit them if the files
# live somewhere else on your system.
blogs_file <- "en_US.blogs.txt"
blogs_lines <- blogs_file %>% read_lines()

news_file <- "en_US.news.txt"
news_lines <- news_file %>% read_lines()

twitter_file <- "en_US.twitter.txt"
twitter_lines <- twitter_file %>% read_lines()
# Per-corpus summary: number of lines read, and number of words, where a
# "word" is any maximal run of non-whitespace characters.
stats <- tibble(
  source = c("blogs", "news", "twitter"),
  text = list(blogs_lines, news_lines, twitter_lines)
) %>%
  transmute(
    source,
    lines = lengths(text),
    words = map_int(text, function(txt) sum(str_count(txt, "\\S+")))
  )

# Render the summary as a captioned table in the report
stats %>%
  knitr::kable(caption = "Line and Word Counts per File")
| source | lines | words |
|---|---|---|
| blogs | 899288 | 37334131 |
| news | 1010242 | 34372530 |
| twitter | 2360148 | 30373543 |
# Histogram of blog line lengths, measured in characters per line
hist(
  nchar(blogs_lines),
  breaks = 50,
  main = "Blog Line Lengths",
  xlab = "Characters"
)
# Split every tweet into single-word tokens, count how often each word
# occurs, and keep the 20 most frequent words.
tw_words <- tibble(text = twitter_lines) %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20)

# Horizontal bar chart of those counts; reorder() sorts the bars by count,
# so after the flip the most frequent word sits at the top.
tw_words %>%
  ggplot(mapping = aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 Twitter Words", x = "Word", y = "Count")