Milestone Report – Text Prediction Project

# Load required libraries
library(dplyr)
library(stringi)
library(ggplot2)
library(wordcloud)
library(tm)

# --- Locate the data ---
# The corpus folder defaults to the author's local copy of the en_US data.
# Set the SWIFTKEY_DATA_DIR environment variable to point the script at a
# different location without editing the code.
data_dir <- Sys.getenv(
  "SWIFTKEY_DATA_DIR",
  unset = "C:/Users/Soniamannepuli/OneDrive/Documents/Coursera-SwiftKey/final/en_US"
)
if (!dir.exists(data_dir)) stop("Data folder not found! Please check your path.")

# --- Load datasets ---
# Each file is addressed through file.path(data_dir, ...) instead of calling
# setwd(), so the session's working directory is left untouched.
# skipNul = TRUE makes readLines() skip embedded NUL bytes.
blogs <- readLines(
  file.path(data_dir, "en_US.blogs.txt"),
  encoding = "UTF-8", skipNul = TRUE
)
news <- readLines(
  file.path(data_dir, "en_US.news.txt"),
  encoding = "UTF-8", skipNul = TRUE
)
twitter <- readLines(
  file.path(data_dir, "en_US.twitter.txt"),
  encoding = "UTF-8", skipNul = TRUE
)

# --- Summaries ---
# One row per source: the number of lines read and the total number of words
# in those lines (stri_count_words() returns a per-line word count).
file_summary <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Lines = lengths(list(blogs, news, twitter)),
  Words = vapply(
    list(blogs, news, twitter),
    function(lines) sum(stri_count_words(lines)),
    integer(1)
  )
)

print(file_summary)

##      File   Lines    Words
## 1   Blogs  899288 37546806
## 2    News 1010206 34761151
## 3 Twitter 2360148 30096690

# Render the same summary as a captioned table.
knitr::kable(
  file_summary,
  caption = "Basic Summary of SwiftKey Datasets"
)

Basic Summary of SwiftKey Datasets
File	Lines	Words
Blogs	899288	37546806
News	1010206	34761151
Twitter	2360148	30096690

# --- Sample the data ---
# Draw a reproducible random subset of lines from each source, in the order
# blogs, news, twitter, and concatenate the draws into one character vector.
set.seed(123)
lines_per_source <- 2000
sample_data <- unlist(
  lapply(
    list(blogs, news, twitter),
    # Cap the draw at the source's length: sample() raises an error when asked
    # for more elements than are available without replacement.
    function(lines) sample(lines, min(lines_per_source, length(lines)))
  ),
  use.names = FALSE
)

# Clean corpus
# Every element of sample_data becomes one document; the text is then
# normalised one transformation at a time.
corpus <- VCorpus(VectorSource(sample_data))
# tolower() is a plain character function, so it is wrapped with
# content_transformer() before being handed to tm_map().
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Run last so that gaps left by the removals above are collapsed too.
corpus <- tm_map(corpus, stripWhitespace)

# Token frequency
# Total each term's count across all documents and rank the terms from most
# to least frequent.
# NOTE(review): TermDocumentMatrix() is called with its default settings; per
# the tm documentation these keep only terms of three or more characters,
# which would explain why short words such as "to" or "of" do not appear in
# the ranking. Confirm this is intended.
tdm <- TermDocumentMatrix(corpus)
freq <- tdm %>%
  as.matrix() %>% # dense terms-by-documents matrix
  rowSums() %>%
  sort(decreasing = TRUE)
# The named vector supplies both the `word` column and the row names.
freq_df <- data.frame(word = names(freq), freq = freq)
head(freq_df, n = 6L)

##      word freq
## the   the 8666
## and   and 4525
## that that 1911
## for   for 1873
## with with 1289
## you   you 1283

# Horizontal bar chart of the first 20 rows of freq_df (the most frequent
# terms, since freq_df is sorted by decreasing frequency).
# head() replaces freq_df[1:20, ] so that a vocabulary with fewer than 20
# terms does not pad the plot data with all-NA rows.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(head(freq_df, 20), aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency") +
  theme_minimal()

# Word cloud of at most 100 terms, sized by frequency and coloured with the
# eight-colour "Dark2" palette.
wordcloud(
  words = freq_df$word,
  freq = freq_df$freq,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)

Milestone Report – Text Prediction Project

Sonia Mannepuli