This report demonstrates that the SwiftKey English text data has been downloaded, loaded, summarized, and visualized, and outlines a concise plan for a next-word prediction algorithm and Shiny app.
## Download & List Files
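The chunks below rely on tidyverse-style packages. The report's setup chunk is not shown, so the following package list is inferred from the functions used later; it is an assumption, not part of the original output.
# Packages assumed to be installed and loaded (inferred from the calls below)
library(dplyr)      # mutate(), rowwise(), count(), group_by(), slice_max(), %>%
library(tibble)     # tibble()
library(tidyr)      # unnest()
library(stringr)    # str_count(), str_replace_all(), str_squish()
library(tidytext)   # unnest_tokens(), stop_words
library(ggplot2)    # histograms and bar charts
# R.utils is only called via R.utils::countLines(), so it just needs to be installed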
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
unzip("Coursera-SwiftKey.zip")
}
data_dir <- "final/en_US"
files <- tibble(
  Dataset = c("Twitter", "Blogs", "News"),
  Path = file.path(data_dir, c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt"))
)
files
## # A tibble: 3 × 2
## Dataset Path
## <chr> <chr>
## 1 Twitter final/en_US/en_US.twitter.txt
## 2 Blogs final/en_US/en_US.blogs.txt
## 3 News final/en_US/en_US.news.txt
## Summary Statistics
# Count the words in a file by reading it in fixed-size chunks, so even the
# largest file never has to sit in memory all at once
word_count_file <- function(fp, chunk = 100000){
  con <- file(fp, open = "r"); on.exit(close(con))
  total <- 0L
  repeat{
    x <- readLines(con, n = chunk, warn = FALSE, skipNul = TRUE)
    if (length(x) == 0) break
    total <- total + sum(stringr::str_count(x, "\\S+"))
  }
  total
}
# Generate summary statistics
summaries <- files %>%
  rowwise() %>%
  mutate(
    Size_MB = round(file.info(Path)$size / (1024^2), 2),
    Lines = R.utils::countLines(Path),
    Words = word_count_file(Path),
    Avg_Words_Per_Line = round(Words / Lines, 2)
  ) %>%
  ungroup()
summaries
## # A tibble: 3 × 6
## Dataset Path Size_MB Lines Words Avg_Words_Per_Line
## <chr> <chr> <dbl> <int> <int> <dbl>
## 1 Twitter final/en_US/en_US.twitter.txt 159. 2.36e6 3.04e7 12.9
## 2 Blogs final/en_US/en_US.blogs.txt 200. 8.99e5 3.73e7 41.5
## 3 News final/en_US/en_US.news.txt 196. 1.01e6 3.44e7 34.0
## Sample & Clean
# Take the first max_lines lines of each file as a working sample
read_sample <- function(fp, max_lines = 50000){
  con <- file(fp, "r"); on.exit(close(con))
  tibble(text = readLines(con, n = max_lines, warn = FALSE, skipNul = TRUE))
}
# Apply sampling
sampled <- files %>% mutate(Data = lapply(Path, read_sample))
sampled
## # A tibble: 3 × 3
## Dataset Path Data
## <chr> <chr> <list>
## 1 Twitter final/en_US/en_US.twitter.txt <tibble [50,000 × 1]>
## 2 Blogs final/en_US/en_US.blogs.txt <tibble [50,000 × 1]>
## 3 News final/en_US/en_US.news.txt <tibble [50,000 × 1]>
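Taking the first 50,000 lines is fast but not random, so material near the top of each file is over-represented. If a representative sample is preferred, each line can instead be kept with a fixed probability, as in the sketch below. This is an alternative approach, not the one used for the figures in this report, and the 10% keep rate is purely illustrative.
# Alternative sampler: keep each line with probability `prob` instead of taking the first N lines
read_random_sample <- function(fp, prob = 0.1, seed = 123){
  set.seed(seed)
  lines <- readLines(fp, warn = FALSE, skipNul = TRUE)  # reads the whole file into memory
  tibble(text = lines[rbinom(length(lines), size = 1, prob = prob) == 1])
}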
## Cleaning & Tokenization Helpers
data("stop_words")
clean_text <- function(text){
text %>%
str_to_lower() %>%
str_replace_all("https?://\\S+|www\\.\\S+", " ") %>%
str_replace_all("@\\w+|[^a-z'\\s]", " ") %>%
str_squish()
}
tokenize_words <- function(df){
df %>%
mutate(text = clean_text(text)) %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word)
}
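A quick check on an invented sentence (made up here purely for illustration) shows what the helpers do: the URL, the @mention, the digits and the punctuation are stripped, and tokenization then drops common stop words.
clean_text("Check out https://example.com with @friend at 10pm!!")
tibble(text = "Check out https://example.com with @friend at 10pm!!") %>% tokenize_words()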
## Word Distribution Plot
line_wc <- sampled %>%
  unnest(Data) %>%
  mutate(Words_in_Line = str_count(text, "\\S+"))
ggplot(line_wc, aes(Words_in_Line)) +
  geom_histogram(binwidth = 2, fill = "steelblue") +
  facet_wrap(~ Dataset, scales = "free_y") +
  labs(title = "Word Distribution per Line by Dataset",
       x = "Words per Line", y = "Frequency")
## Most Common Words (Stopwords Removed)
top_words <- sampled %>%
  unnest(Data) %>%
  tokenize_words() %>%
  count(Dataset, word, sort = TRUE) %>%
  group_by(Dataset) %>%
  slice_max(n, n = 15, with_ties = FALSE) %>%  # exactly 15 words per dataset, even with tied counts
  ungroup()
ggplot(top_words, aes(n, reorder(word, n))) +
  geom_col(fill = "orange") +
  facet_wrap(~ Dataset, scales = "free") +
  labs(title = "Top 15 Common Words (Excluding Stopwords)",
       x = "Frequency", y = "")
## Conclusion
The three datasets were loaded and profiled without issue. Twitter lines are short and informal (about 13 words per line on average), blog lines are the longest and most descriptive (about 42 words), and news lines are formal and mid-length (about 34 words). These findings support building an n-gram (unigram/bigram/trigram) language model with a simple backoff strategy for next-word prediction, to be deployed as a Shiny web application.
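As a rough illustration of the planned backoff strategy, the sketch below assumes the bigram and trigram tables from the previous sketch: it looks for the most frequent trigram starting with the last two words typed, backs off to bigrams keyed on the last word, and finally falls back to a default word. All names here are illustrative, not part of the analysis above.
# Illustrative backoff lookup over the trigram/bigram tables sketched earlier
predict_next <- function(last_two, trigrams, bigrams, fallback = "the"){
  # 1. Trigrams whose first two words match the input
  hit <- trigrams %>%
    filter(str_detect(ngram, paste0("^", last_two, " "))) %>%
    slice_max(n, n = 1, with_ties = FALSE)
  if (nrow(hit) == 1) return(word(hit$ngram, 3))
  # 2. Back off to bigrams keyed on the single last word
  last_one <- word(last_two, -1)
  hit <- bigrams %>%
    filter(str_detect(ngram, paste0("^", last_one, " "))) %>%
    slice_max(n, n = 1, with_ties = FALSE)
  if (nrow(hit) == 1) return(word(hit$ngram, 2))
  # 3. Fall back to a default word when nothing matches
  fallback
}
predict_next("one of", trigrams, bigrams)
The Shiny app would wrap a lookup of this kind: a text input for the user's sentence, a call to the prediction function on its last words, and the suggested word(s) rendered back to the page.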